From 695d9e1833cf0823965d9becdb9621cc552c2039 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 9 May 2025 08:54:15 +0200 Subject: [PATCH 01/56] api change --- admin-api-lib/openapi.yaml | 85 ++-- .../src/admin_api_lib/apis/admin_api.py | 84 ++-- .../src/admin_api_lib/apis/admin_api_base.py | 43 +- .../openapi_client/__init__.py | 53 +-- .../openapi_client/api/__init__.py | 5 +- .../openapi_client/api/extractor_api.py | 372 ++++-------------- .../openapi_client/api_client.py | 313 ++++++++++----- .../openapi_client/api_response.py | 11 +- .../openapi_client/configuration.py | 108 +++-- .../openapi_client/exceptions.py | 40 +- .../openapi_client/models/__init__.py | 29 +- .../openapi_client/models/content_type.py | 24 +- .../models/extraction_request.py | 53 ++- .../models/information_piece.py | 56 ++- .../openapi_client/models/key_value_pair.py | 40 +- .../openapi_client/rest.py | 118 ++++-- .../openapi_client/test/test_content_type.py | 33 ++ .../test/test_extraction_request.py | 56 +++ .../openapi_client/test/test_extractor_api.py | 37 ++ .../test/test_information_piece.py | 60 +++ .../test/test_key_value_pair.py | 52 +++ .../admin_api_lib/models/document_status.py | 40 +- .../src/admin_api_lib/models/extra_models.py | 1 - .../admin_api_lib/models/key_value_pair.py | 102 +++++ .../src/admin_api_lib/models/status.py | 27 +- .../src/admin_api_lib/models/upload_source.py | 102 +++++ extractor-api-lib/openapi.yaml | 96 +---- .../extractor_api_lib/apis/extractor_api.py | 78 ++-- .../apis/extractor_api_base.py | 49 +-- .../extractor_api_lib/models/content_type.py | 25 +- .../extractor_api_lib/models/extra_models.py | 1 - .../models/extraction_request.py | 54 ++- .../models/information_piece.py | 51 ++- .../models/key_value_pair.py | 43 +- rag-core-api/src/rag_core_api/apis/rag_api.py | 35 +- .../src/rag_core_api/apis/rag_api_base.py | 13 +- .../src/rag_core_api/models/chat_history.py | 49 ++- .../models/chat_history_message.py | 40 +- 
.../src/rag_core_api/models/chat_request.py | 47 +-- .../src/rag_core_api/models/chat_response.py | 53 ++- .../src/rag_core_api/models/chat_role.py | 23 +- .../src/rag_core_api/models/content_type.py | 27 +- .../src/rag_core_api/models/delete_request.py | 49 ++- .../src/rag_core_api/models/extra_models.py | 1 - .../rag_core_api/models/information_piece.py | 55 ++- .../src/rag_core_api/models/key_value_pair.py | 37 +- 46 files changed, 1527 insertions(+), 1243 deletions(-) create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py create mode 100644 admin-api-lib/src/admin_api_lib/models/key_value_pair.py create mode 100644 admin-api-lib/src/admin_api_lib/models/upload_source.py diff --git a/admin-api-lib/openapi.yaml b/admin-api-lib/openapi.yaml index c1b8afe..1b8255a 100644 --- a/admin-api-lib/openapi.yaml +++ b/admin-api-lib/openapi.yaml @@ -47,29 +47,6 @@ paths: description: Internal server error tags: - admin - /upload_documents: - post: - description: Uploads user selected pdf documents. - operationId: upload_documents_post - requestBody: - content: - application/pdf: - schema: - format: binary - type: string - description: The PDF document to upload. - required: true - responses: - "200": - description: ok - "400": - description: Bad request - "422": - description: If no text has been extracted from the file. 
- "500": - description: Internal server error - tags: - - admin /delete_document/{identification}: delete: operationId: delete_document @@ -104,22 +81,28 @@ paths: description: Internal server error tags: - admin - /load_confluence: + /upload_source: post: + description: Uploads user selected pdf documents. + operationId: upload_source + requestBody: + content: + application/pdf: + schema: + $ref: '#/components/schemas/upload_source' + description: The PDF document to upload. + required: true responses: "200": - description: Loading from confluence is successful - "423": - description: "if the confluence loader is already processing a request,\ - \ no further requests are possible. The user needs to wait, till the preliminary\ - \ request finished processing." + description: ok + "400": + description: Bad request + "422": + description: If no text has been extracted from the file. "500": - description: Internal Server Error - "501": - description: The confluence loader is not set up + description: Internal server error tags: - admin - summary: Loading confluence to the vector db components: schemas: status: @@ -148,3 +131,39 @@ components: - status title: document_status type: object + upload_source: + description: "" + properties: + file: + description: "" + format: binary + title: file + type: string + type: + description: "" + title: type + type: string + kwargs: + description: "" + items: + $ref: '#/components/schemas/key_value_pair' + title: kwargs + type: array + required: + - type + title: upload_source + type: object + key_value_pair: + description: "" + example: + value: value + key: key + properties: + key: + description: "" + title: Key + value: + description: "" + title: Value + title: MetaInformationPiece + type: object diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 16efc4b..622cd5a 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ 
b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -2,14 +2,36 @@ # coding: utf-8 +from typing import Dict, List # noqa: F401 import importlib import pkgutil +from admin_api_lib.apis.admin_api_base import BaseAdminApi from fastapi import APIRouter, Path, Request, Response, UploadFile # noqa: F401 import admin_api_lib.impl -from admin_api_lib.apis.admin_api_base import BaseAdminApi + +from fastapi import ( # noqa: F401 + APIRouter, + Body, + Cookie, + Depends, + Form, + Header, + HTTPException, + Path, + Query, + Response, + Security, + status, +) + +from admin_api_lib.models.extra_models import TokenModel # noqa: F401 +from pydantic import Field, StrictBytes, StrictStr +from typing import Any, List, Tuple, Union +from typing_extensions import Annotated from admin_api_lib.models.document_status import DocumentStatus +from admin_api_lib.models.upload_source import UploadSource router = APIRouter() @@ -43,6 +65,8 @@ async def delete_document( ------- None """ + if not BaseAdminApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().delete_document(identification) @@ -73,6 +97,8 @@ async def document_reference_id_get( Response The response object containing the document reference details. """ + if not BaseAdminApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().document_reference_id_get(identification) @@ -94,39 +120,13 @@ async def get_all_documents_status() -> list[DocumentStatus]: list[DocumentStatus] A list containing the status of all documents. 
""" + if not BaseAdminApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().get_all_documents_status() @router.post( - "/load_confluence", - responses={ - 200: {"description": "Loading from confluence is successful"}, - 423: { - "description": ( - "if the confluence loader is already processing a request," - "no further requests are possible. The user needs to wait," - "till the preliminary request finished processing." - ) - }, - 500: {"description": "Internal Server Error"}, - 501: {"description": "The confluence loader is not set up"}, - }, - tags=["admin"], - response_model_by_alias=True, -) -async def load_confluence_post() -> None: - """ - Asynchronously loads a Confluence space. - - Returns - ------- - None - """ - return await BaseAdminApi.subclasses[0]().load_confluence_post() - - -@router.post( - "/upload_documents", + "/upload_source", responses={ 200: {"description": "ok"}, 400: {"description": "Bad request"}, @@ -136,22 +136,10 @@ async def load_confluence_post() -> None: tags=["admin"], response_model_by_alias=True, ) -async def upload_documents_post( - body: UploadFile, - request: Request, +async def upload_source( + upload_source: Annotated[UploadSource, Field(description="The source to upload.")] = Body(None, description="The source to upload."), ) -> None: - """ - Asynchronously uploads user-selected source documents. - - Parameters - ---------- - body : UploadFile - The file object containing the source documents to be uploaded. - request : Request - The request object containing metadata about the upload request. 
- - Returns - ------- - None - """ - return await BaseAdminApi.subclasses[0]().upload_documents_post(body, request) + """Uploads user selected source.""" + if not BaseAdminApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") + return await BaseAdminApi.subclasses[0]().upload_source(upload_source) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index 6d12beb..efeb120 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -1,13 +1,14 @@ -"""Module for the base AdminApi interface.""" - # coding: utf-8 -# flake8: noqa: D105 -from typing import ClassVar, Tuple # noqa: F401 +from typing import ClassVar, Dict, List, Tuple # noqa: F401 +from pydantic import Field, StrictBytes, StrictStr +from typing import Any, List, Tuple, Union +from typing_extensions import Annotated from fastapi import Request, Response, UploadFile from admin_api_lib.models.document_status import DocumentStatus +from admin_api_lib.models.upload_source import UploadSource class BaseAdminApi: @@ -28,7 +29,7 @@ def __init_subclass__(cls, **kwargs): async def delete_document( self, - identification: str, + identification: StrictStr, ) -> None: """ Asynchronously deletes a document based on the provided identification. @@ -43,6 +44,7 @@ async def delete_document( None """ + async def document_reference_id_get( self, identification: str, @@ -61,6 +63,7 @@ async def document_reference_id_get( The response object containing the document reference details. """ + async def get_all_documents_status( self, ) -> list[DocumentStatus]: @@ -73,33 +76,9 @@ async def get_all_documents_status( A list containing the status of all documents. """ - async def load_confluence_post( - self, - ) -> None: - """ - Asynchronously loads a Confluence space. 
- Returns - ------- - None - """ - - async def upload_documents_post( + async def upload_source( self, - body: UploadFile, - request: Request, + upload_source: Annotated[UploadSource, Field(description="The PDF document to upload.")], ) -> None: - """ - Asynchronously uploads user-selected source documents. - - Parameters - ---------- - body : UploadFile - The file object containing the source documents to be uploaded. - request : Request - The request object containing metadata about the upload request. - - Returns - ------- - None - """ + ... diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py index 79a89e3..ae86262 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py @@ -3,52 +3,35 @@ # flake8: noqa """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 __version__ = "1.0.0" # import apis into sdk package -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( - ExtractorApi, -) -from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi # import ApiClient from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse -from admin_api_lib.extractor_api_client.openapi_client.configuration import ( - Configuration, -) -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ( - ApiAttributeError, - ApiException, - ApiKeyError, - ApiTypeError, - ApiValueError, - OpenApiException, -) +from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient +from admin_api_lib.extractor_api_client.openapi_client.configuration import Configuration +from admin_api_lib.extractor_api_client.openapi_client.exceptions import OpenApiException +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiTypeError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiValueError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiKeyError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiAttributeError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiException # import models into sdk package -from admin_api_lib.extractor_api_client.openapi_client.models.confluence_parameters import ( - ConfluenceParameters, -) -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ( - ContentType, -) -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ( - ExtractionRequest, -) -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import ( - InformationPiece, -) -from 
admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import ( - KeyValuePair, -) +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py index 13a312f..792725e 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py @@ -1,6 +1,5 @@ # flake8: noqa # import apis into api package -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( - ExtractorApi, -) +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi + diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py index f1fddba..e4a0fa6 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py @@ -1,36 +1,27 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the 
OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 import warnings +from pydantic import validate_call, Field, StrictFloat, StrictStr, StrictInt from typing import Any, Dict, List, Optional, Tuple, Union - -from pydantic import Field, StrictFloat, StrictInt, StrictStr, validate_call from typing_extensions import Annotated -from admin_api_lib.extractor_api_client.openapi_client.api_client import ( - ApiClient, - RequestSerialized, -) +from typing import List +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece + +from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient, RequestSerialized from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse -from admin_api_lib.extractor_api_client.openapi_client.models.confluence_parameters import ( - ConfluenceParameters, -) -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ( - ExtractionRequest, -) -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import ( - InformationPiece, -) from admin_api_lib.extractor_api_client.openapi_client.rest import RESTResponseType @@ -46,244 +37,6 @@ def __init__(self, api_client=None) -> None: api_client = ApiClient.get_default() self.api_client = api_client - @validate_call - def extract_from_confluence_post( - self, - confluence_parameters: ConfluenceParameters, - _request_timeout: Union[ - None, - Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], - ] = None, - _request_auth: Optional[Dict[StrictStr, Any]] = None, - _content_type: Optional[StrictStr] = None, - _headers: Optional[Dict[StrictStr, Any]] = None, - _host_index: 
Annotated[StrictInt, Field(ge=0, le=0)] = 0, - ) -> List[InformationPiece]: - """extract_from_confluence_post - - - :param confluence_parameters: (required) - :type confluence_parameters: ConfluenceParameters - :param _request_timeout: timeout setting for this request. If one - number provided, it will be total request - timeout. It can also be a pair (tuple) of - (connection, read) timeouts. - :type _request_timeout: int, tuple(int, int), optional - :param _request_auth: set to override the auth_settings for an a single - request; this effectively ignores the - authentication in the spec for a single request. - :type _request_auth: dict, optional - :param _content_type: force content-type for the request. - :type _content_type: str, Optional - :param _headers: set to override the headers for a single - request; this effectively ignores the headers - in the spec for a single request. - :type _headers: dict, optional - :param _host_index: set to override the host_index for a single - request; this effectively ignores the host_index - in the spec for a single request. - :type _host_index: int, optional - :return: Returns the result object. 
- """ # noqa: E501 - - _param = self._extract_from_confluence_post_serialize( - confluence_parameters=confluence_parameters, - _request_auth=_request_auth, - _content_type=_content_type, - _headers=_headers, - _host_index=_host_index, - ) - - _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "404": None, - "422": None, - "500": None, - } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) - response_data.read() - return self.api_client.response_deserialize( - response_data=response_data, - response_types_map=_response_types_map, - ).data - - @validate_call - def extract_from_confluence_post_with_http_info( - self, - confluence_parameters: ConfluenceParameters, - _request_timeout: Union[ - None, - Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], - ] = None, - _request_auth: Optional[Dict[StrictStr, Any]] = None, - _content_type: Optional[StrictStr] = None, - _headers: Optional[Dict[StrictStr, Any]] = None, - _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, - ) -> ApiResponse[List[InformationPiece]]: - """extract_from_confluence_post - - - :param confluence_parameters: (required) - :type confluence_parameters: ConfluenceParameters - :param _request_timeout: timeout setting for this request. If one - number provided, it will be total request - timeout. It can also be a pair (tuple) of - (connection, read) timeouts. - :type _request_timeout: int, tuple(int, int), optional - :param _request_auth: set to override the auth_settings for an a single - request; this effectively ignores the - authentication in the spec for a single request. - :type _request_auth: dict, optional - :param _content_type: force content-type for the request. - :type _content_type: str, Optional - :param _headers: set to override the headers for a single - request; this effectively ignores the headers - in the spec for a single request. 
- :type _headers: dict, optional - :param _host_index: set to override the host_index for a single - request; this effectively ignores the host_index - in the spec for a single request. - :type _host_index: int, optional - :return: Returns the result object. - """ # noqa: E501 - - _param = self._extract_from_confluence_post_serialize( - confluence_parameters=confluence_parameters, - _request_auth=_request_auth, - _content_type=_content_type, - _headers=_headers, - _host_index=_host_index, - ) - - _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "404": None, - "422": None, - "500": None, - } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) - response_data.read() - return self.api_client.response_deserialize( - response_data=response_data, - response_types_map=_response_types_map, - ) - - @validate_call - def extract_from_confluence_post_without_preload_content( - self, - confluence_parameters: ConfluenceParameters, - _request_timeout: Union[ - None, - Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], - ] = None, - _request_auth: Optional[Dict[StrictStr, Any]] = None, - _content_type: Optional[StrictStr] = None, - _headers: Optional[Dict[StrictStr, Any]] = None, - _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, - ) -> RESTResponseType: - """extract_from_confluence_post - - - :param confluence_parameters: (required) - :type confluence_parameters: ConfluenceParameters - :param _request_timeout: timeout setting for this request. If one - number provided, it will be total request - timeout. It can also be a pair (tuple) of - (connection, read) timeouts. - :type _request_timeout: int, tuple(int, int), optional - :param _request_auth: set to override the auth_settings for an a single - request; this effectively ignores the - authentication in the spec for a single request. 
- :type _request_auth: dict, optional - :param _content_type: force content-type for the request. - :type _content_type: str, Optional - :param _headers: set to override the headers for a single - request; this effectively ignores the headers - in the spec for a single request. - :type _headers: dict, optional - :param _host_index: set to override the host_index for a single - request; this effectively ignores the host_index - in the spec for a single request. - :type _host_index: int, optional - :return: Returns the result object. - """ # noqa: E501 - - _param = self._extract_from_confluence_post_serialize( - confluence_parameters=confluence_parameters, - _request_auth=_request_auth, - _content_type=_content_type, - _headers=_headers, - _host_index=_host_index, - ) - - _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "404": None, - "422": None, - "500": None, - } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) - return response_data.response - - def _extract_from_confluence_post_serialize( - self, - confluence_parameters, - _request_auth, - _content_type, - _headers, - _host_index, - ) -> RequestSerialized: - _host = None - - _collection_formats: Dict[str, str] = {} - - _path_params: Dict[str, str] = {} - _query_params: List[Tuple[str, str]] = [] - _header_params: Dict[str, Optional[str]] = _headers or {} - _form_params: List[Tuple[str, str]] = [] - _files: Dict[str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]] = {} - _body_params: Optional[bytes] = None - - # process the path parameters - # process the query parameters - # process the header parameters - # process the form parameters - # process the body parameter - if confluence_parameters is not None: - _body_params = confluence_parameters - - # set the HTTP header `Accept` - if "Accept" not in _header_params: - _header_params["Accept"] = self.api_client.select_header_accept(["application/json"]) - - # set the 
HTTP header `Content-Type` - if _content_type: - _header_params["Content-Type"] = _content_type - else: - _default_content_type = self.api_client.select_header_content_type(["application/json"]) - if _default_content_type is not None: - _header_params["Content-Type"] = _default_content_type - - # authentication setting - _auth_settings: List[str] = [] - - return self.api_client.param_serialize( - method="POST", - resource_path="/extract_from_confluence", - path_params=_path_params, - query_params=_query_params, - header_params=_header_params, - body=_body_params, - post_params=_form_params, - files=_files, - auth_settings=_auth_settings, - collection_formats=_collection_formats, - _host=_host, - _request_auth=_request_auth, - ) @validate_call def extract_from_file_post( @@ -292,7 +45,10 @@ def extract_from_file_post( _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + Tuple[ + Annotated[StrictFloat, Field(gt=0)], + Annotated[StrictFloat, Field(gt=0)] + ] ] = None, _request_auth: Optional[Dict[StrictStr, Any]] = None, _content_type: Optional[StrictStr] = None, @@ -324,28 +80,32 @@ def extract_from_file_post( in the spec for a single request. :type _host_index: int, optional :return: Returns the result object. 
- """ # noqa: E501 + """ # noqa: E501 _param = self._extract_from_file_post_serialize( extraction_request=extraction_request, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, - _host_index=_host_index, + _host_index=_host_index ) _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "422": None, - "500": None, + '200': "List[InformationPiece]", + '422': None, + '500': None, } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + response_data = self.api_client.call_api( + *_param, + _request_timeout=_request_timeout + ) response_data.read() return self.api_client.response_deserialize( response_data=response_data, response_types_map=_response_types_map, ).data + @validate_call def extract_from_file_post_with_http_info( self, @@ -353,7 +113,10 @@ def extract_from_file_post_with_http_info( _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + Tuple[ + Annotated[StrictFloat, Field(gt=0)], + Annotated[StrictFloat, Field(gt=0)] + ] ] = None, _request_auth: Optional[Dict[StrictStr, Any]] = None, _content_type: Optional[StrictStr] = None, @@ -385,28 +148,32 @@ def extract_from_file_post_with_http_info( in the spec for a single request. :type _host_index: int, optional :return: Returns the result object. 
- """ # noqa: E501 + """ # noqa: E501 _param = self._extract_from_file_post_serialize( extraction_request=extraction_request, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, - _host_index=_host_index, + _host_index=_host_index ) _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "422": None, - "500": None, + '200': "List[InformationPiece]", + '422': None, + '500': None, } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + response_data = self.api_client.call_api( + *_param, + _request_timeout=_request_timeout + ) response_data.read() return self.api_client.response_deserialize( response_data=response_data, response_types_map=_response_types_map, ) + @validate_call def extract_from_file_post_without_preload_content( self, @@ -414,7 +181,10 @@ def extract_from_file_post_without_preload_content( _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + Tuple[ + Annotated[StrictFloat, Field(gt=0)], + Annotated[StrictFloat, Field(gt=0)] + ] ] = None, _request_auth: Optional[Dict[StrictStr, Any]] = None, _content_type: Optional[StrictStr] = None, @@ -446,24 +216,28 @@ def extract_from_file_post_without_preload_content( in the spec for a single request. :type _host_index: int, optional :return: Returns the result object. 
- """ # noqa: E501 + """ # noqa: E501 _param = self._extract_from_file_post_serialize( extraction_request=extraction_request, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, - _host_index=_host_index, + _host_index=_host_index ) _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "422": None, - "500": None, + '200': "List[InformationPiece]", + '422': None, + '500': None, } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + response_data = self.api_client.call_api( + *_param, + _request_timeout=_request_timeout + ) return response_data.response + def _extract_from_file_post_serialize( self, extraction_request, @@ -472,15 +246,19 @@ def _extract_from_file_post_serialize( _headers, _host_index, ) -> RequestSerialized: + _host = None - _collection_formats: Dict[str, str] = {} + _collection_formats: Dict[str, str] = { + } _path_params: Dict[str, str] = {} _query_params: List[Tuple[str, str]] = [] _header_params: Dict[str, Optional[str]] = _headers or {} _form_params: List[Tuple[str, str]] = [] - _files: Dict[str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]] = {} + _files: Dict[ + str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]] + ] = {} _body_params: Optional[bytes] = None # process the path parameters @@ -491,24 +269,36 @@ def _extract_from_file_post_serialize( if extraction_request is not None: _body_params = extraction_request + # set the HTTP header `Accept` - if "Accept" not in _header_params: - _header_params["Accept"] = self.api_client.select_header_accept(["application/json"]) + if 'Accept' not in _header_params: + _header_params['Accept'] = self.api_client.select_header_accept( + [ + 'application/json' + ] + ) # set the HTTP header `Content-Type` if _content_type: - _header_params["Content-Type"] = _content_type + _header_params['Content-Type'] = _content_type else: - _default_content_type = 
self.api_client.select_header_content_type(["application/json"]) + _default_content_type = ( + self.api_client.select_header_content_type( + [ + 'application/json' + ] + ) + ) if _default_content_type is not None: - _header_params["Content-Type"] = _default_content_type + _header_params['Content-Type'] = _default_content_type # authentication setting - _auth_settings: List[str] = [] + _auth_settings: List[str] = [ + ] return self.api_client.param_serialize( - method="POST", - resource_path="/extract_from_file", + method='POST', + resource_path='/extract', path_params=_path_params, query_params=_query_params, header_params=_header_params, @@ -518,5 +308,7 @@ def _extract_from_file_post_serialize( auth_settings=_auth_settings, collection_formats=_collection_formats, _host=_host, - _request_auth=_request_auth, + _request_auth=_request_auth ) + + diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py index 911fd0d..befdba6 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py @@ -1,53 +1,47 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 import datetime +from dateutil.parser import parse +from enum import Enum import decimal import json import mimetypes import os import re import tempfile -from enum import Enum -from typing import Dict, List, Optional, Tuple, Union -from urllib.parse import quote -from dateutil.parser import parse +from urllib.parse import quote +from typing import Tuple, Optional, List, Dict, Union from pydantic import SecretStr +from admin_api_lib.extractor_api_client.openapi_client.configuration import Configuration +from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse, T as ApiResponseT import admin_api_lib.extractor_api_client.openapi_client.models from admin_api_lib.extractor_api_client.openapi_client import rest -from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse -from admin_api_lib.extractor_api_client.openapi_client.api_response import ( - T as ApiResponseT, -) -from admin_api_lib.extractor_api_client.openapi_client.configuration import ( - Configuration, -) from admin_api_lib.extractor_api_client.openapi_client.exceptions import ( - ApiException, ApiValueError, + ApiException, BadRequestException, + UnauthorizedException, ForbiddenException, NotFoundException, - ServiceException, - UnauthorizedException, + ServiceException ) RequestSerialized = Tuple[str, str, Dict[str, str], Optional[str], List[str]] - class ApiClient: """Generic API client for OpenAPI client library builds. @@ -66,19 +60,25 @@ class ApiClient: PRIMITIVE_TYPES = (float, bool, bytes, str, int) NATIVE_TYPES_MAPPING = { - "int": int, - "long": int, # TODO remove as only py3 is supported? - "float": float, - "str": str, - "bool": bool, - "date": datetime.date, - "datetime": datetime.datetime, - "decimal": decimal.Decimal, - "object": object, + 'int': int, + 'long': int, # TODO remove as only py3 is supported? 
+ 'float': float, + 'str': str, + 'bool': bool, + 'date': datetime.date, + 'datetime': datetime.datetime, + 'decimal': decimal.Decimal, + 'object': object, } _pool = None - def __init__(self, configuration=None, header_name=None, header_value=None, cookie=None) -> None: + def __init__( + self, + configuration=None, + header_name=None, + header_value=None, + cookie=None + ) -> None: # use default configuration if none is provided if configuration is None: configuration = Configuration.get_default() @@ -90,7 +90,7 @@ def __init__(self, configuration=None, header_name=None, header_value=None, cook self.default_headers[header_name] = header_value self.cookie = cookie # Set default User-Agent. - self.user_agent = "OpenAPI-Generator/1.0.0/python" + self.user_agent = 'OpenAPI-Generator/1.0.0/python' self.client_side_validation = configuration.client_side_validation def __enter__(self): @@ -102,15 +102,16 @@ def __exit__(self, exc_type, exc_value, traceback): @property def user_agent(self): """User agent for this API client""" - return self.default_headers["User-Agent"] + return self.default_headers['User-Agent'] @user_agent.setter def user_agent(self, value): - self.default_headers["User-Agent"] = value + self.default_headers['User-Agent'] = value def set_default_header(self, header_name, header_value): self.default_headers[header_name] = header_value + _default = None @classmethod @@ -146,12 +147,12 @@ def param_serialize( header_params=None, body=None, post_params=None, - files=None, - auth_settings=None, + files=None, auth_settings=None, collection_formats=None, _host=None, - _request_auth=None, + _request_auth=None ) -> RequestSerialized: + """Builds the HTTP request params needed by the request. :param method: Method to call. :param resource_path: Path to method endpoint. 
@@ -180,30 +181,47 @@ def param_serialize( header_params = header_params or {} header_params.update(self.default_headers) if self.cookie: - header_params["Cookie"] = self.cookie + header_params['Cookie'] = self.cookie if header_params: header_params = self.sanitize_for_serialization(header_params) - header_params = dict(self.parameters_to_tuples(header_params, collection_formats)) + header_params = dict( + self.parameters_to_tuples(header_params,collection_formats) + ) # path parameters if path_params: path_params = self.sanitize_for_serialization(path_params) - path_params = self.parameters_to_tuples(path_params, collection_formats) + path_params = self.parameters_to_tuples( + path_params, + collection_formats + ) for k, v in path_params: # specified safe chars, encode everything - resource_path = resource_path.replace("{%s}" % k, quote(str(v), safe=config.safe_chars_for_path_param)) + resource_path = resource_path.replace( + '{%s}' % k, + quote(str(v), safe=config.safe_chars_for_path_param) + ) # post parameters if post_params or files: post_params = post_params if post_params else [] post_params = self.sanitize_for_serialization(post_params) - post_params = self.parameters_to_tuples(post_params, collection_formats) + post_params = self.parameters_to_tuples( + post_params, + collection_formats + ) if files: post_params.extend(self.files_parameters(files)) # auth setting self.update_params_for_auth( - header_params, query_params, auth_settings, resource_path, method, body, request_auth=_request_auth + header_params, + query_params, + auth_settings, + resource_path, + method, + body, + request_auth=_request_auth ) # body @@ -220,13 +238,23 @@ def param_serialize( # query parameters if query_params: query_params = self.sanitize_for_serialization(query_params) - url_query = self.parameters_to_url_query(query_params, collection_formats) + url_query = self.parameters_to_url_query( + query_params, + collection_formats + ) url += "?" 
+ url_query return method, url, header_params, body, post_params + def call_api( - self, method, url, header_params=None, body=None, post_params=None, _request_timeout=None + self, + method, + url, + header_params=None, + body=None, + post_params=None, + _request_timeout=None ) -> rest.RESTResponse: """Makes the HTTP request (synchronous) :param method: Method to call. @@ -243,12 +271,10 @@ def call_api( try: # perform request and return response response_data = self.rest_client.request( - method, - url, + method, url, headers=header_params, - body=body, - post_params=post_params, - _request_timeout=_request_timeout, + body=body, post_params=post_params, + _request_timeout=_request_timeout ) except ApiException as e: @@ -257,7 +283,9 @@ def call_api( return response_data def response_deserialize( - self, response_data: rest.RESTResponse, response_types_map: Optional[Dict[str, ApiResponseT]] = None + self, + response_data: rest.RESTResponse, + response_types_map: Optional[Dict[str, ApiResponseT]]=None ) -> ApiResponse[ApiResponseT]: """Deserializes response into an object. :param response_data: RESTResponse object to be deserialized. 
@@ -283,7 +311,7 @@ def response_deserialize( return_data = self.__deserialize_file(response_data) elif response_type is not None: match = None - content_type = response_data.getheader("content-type") + content_type = response_data.getheader('content-type') if content_type is not None: match = re.search(r"charset=([a-zA-Z\-\d]+)[\s;]?", content_type) encoding = match.group(1) if match else "utf-8" @@ -298,10 +326,10 @@ def response_deserialize( ) return ApiResponse( - status_code=response_data.status, - data=return_data, - headers=response_data.getheaders(), - raw_data=response_data.data, + status_code = response_data.status, + data = return_data, + headers = response_data.getheaders(), + raw_data = response_data.data ) def sanitize_for_serialization(self, obj): @@ -329,9 +357,13 @@ def sanitize_for_serialization(self, obj): elif isinstance(obj, self.PRIMITIVE_TYPES): return obj elif isinstance(obj, list): - return [self.sanitize_for_serialization(sub_obj) for sub_obj in obj] + return [ + self.sanitize_for_serialization(sub_obj) for sub_obj in obj + ] elif isinstance(obj, tuple): - return tuple(self.sanitize_for_serialization(sub_obj) for sub_obj in obj) + return tuple( + self.sanitize_for_serialization(sub_obj) for sub_obj in obj + ) elif isinstance(obj, (datetime.datetime, datetime.date)): return obj.isoformat() elif isinstance(obj, decimal.Decimal): @@ -345,12 +377,15 @@ def sanitize_for_serialization(self, obj): # and attributes which value is not None. # Convert attribute name to json key in # model definition for request. 
- if hasattr(obj, "to_dict") and callable(getattr(obj, "to_dict")): + if hasattr(obj, 'to_dict') and callable(getattr(obj, 'to_dict')): obj_dict = obj.to_dict() else: obj_dict = obj.__dict__ - return {key: self.sanitize_for_serialization(val) for key, val in obj_dict.items()} + return { + key: self.sanitize_for_serialization(val) + for key, val in obj_dict.items() + } def deserialize(self, response_text: str, response_type: str, content_type: Optional[str]): """Deserializes response into an object. @@ -369,15 +404,18 @@ def deserialize(self, response_text: str, response_type: str, content_type: Opti data = json.loads(response_text) except ValueError: data = response_text - elif re.match(r"^application/(json|[\w!#$&.+-^_]+\+json)\s*(;|$)", content_type, re.IGNORECASE): + elif re.match(r'^application/(json|[\w!#$&.+-^_]+\+json)\s*(;|$)', content_type, re.IGNORECASE): if response_text == "": data = "" else: data = json.loads(response_text) - elif re.match(r"^text\/[a-z.+-]+\s*(;|$)", content_type, re.IGNORECASE): + elif re.match(r'^text\/[a-z.+-]+\s*(;|$)', content_type, re.IGNORECASE): data = response_text else: - raise ApiException(status=0, reason="Unsupported content type: {0}".format(content_type)) + raise ApiException( + status=0, + reason="Unsupported content type: {0}".format(content_type) + ) return self.__deserialize(data, response_type) @@ -393,17 +431,19 @@ def __deserialize(self, data, klass): return None if isinstance(klass, str): - if klass.startswith("List["): - m = re.match(r"List\[(.*)]", klass) + if klass.startswith('List['): + m = re.match(r'List\[(.*)]', klass) assert m is not None, "Malformed List type definition" sub_kls = m.group(1) - return [self.__deserialize(sub_data, sub_kls) for sub_data in data] + return [self.__deserialize(sub_data, sub_kls) + for sub_data in data] - if klass.startswith("Dict["): - m = re.match(r"Dict\[([^,]*), (.*)]", klass) + if klass.startswith('Dict['): + m = re.match(r'Dict\[([^,]*), (.*)]', klass) assert m is not 
None, "Malformed Dict type definition" sub_kls = m.group(2) - return {k: self.__deserialize(v, sub_kls) for k, v in data.items()} + return {k: self.__deserialize(v, sub_kls) + for k, v in data.items()} # convert str to class if klass in self.NATIVE_TYPES_MAPPING: @@ -439,18 +479,19 @@ def parameters_to_tuples(self, params, collection_formats): for k, v in params.items() if isinstance(params, dict) else params: if k in collection_formats: collection_format = collection_formats[k] - if collection_format == "multi": + if collection_format == 'multi': new_params.extend((k, value) for value in v) else: - if collection_format == "ssv": - delimiter = " " - elif collection_format == "tsv": - delimiter = "\t" - elif collection_format == "pipes": - delimiter = "|" + if collection_format == 'ssv': + delimiter = ' ' + elif collection_format == 'tsv': + delimiter = '\t' + elif collection_format == 'pipes': + delimiter = '|' else: # csv is the default - delimiter = "," - new_params.append((k, delimiter.join(str(value) for value in v))) + delimiter = ',' + new_params.append( + (k, delimiter.join(str(value) for value in v))) else: new_params.append((k, v)) return new_params @@ -475,18 +516,20 @@ def parameters_to_url_query(self, params, collection_formats): if k in collection_formats: collection_format = collection_formats[k] - if collection_format == "multi": + if collection_format == 'multi': new_params.extend((k, str(value)) for value in v) else: - if collection_format == "ssv": - delimiter = " " - elif collection_format == "tsv": - delimiter = "\t" - elif collection_format == "pipes": - delimiter = "|" + if collection_format == 'ssv': + delimiter = ' ' + elif collection_format == 'tsv': + delimiter = '\t' + elif collection_format == 'pipes': + delimiter = '|' else: # csv is the default - delimiter = "," - new_params.append((k, delimiter.join(quote(str(value)) for value in v))) + delimiter = ',' + new_params.append( + (k, delimiter.join(quote(str(value)) for value in v)) + ) 
else: new_params.append((k, quote(str(v)))) @@ -504,7 +547,7 @@ def files_parameters( params = [] for k, v in files.items(): if isinstance(v, str): - with open(v, "rb") as f: + with open(v, 'rb') as f: filename = os.path.basename(f.name) filedata = f.read() elif isinstance(v, bytes): @@ -518,8 +561,13 @@ def files_parameters( continue else: raise ValueError("Unsupported file value") - mimetype = mimetypes.guess_type(filename)[0] or "application/octet-stream" - params.append(tuple([k, tuple([filename, filedata, mimetype])])) + mimetype = ( + mimetypes.guess_type(filename)[0] + or 'application/octet-stream' + ) + params.append( + tuple([k, tuple([filename, filedata, mimetype])]) + ) return params def select_header_accept(self, accepts: List[str]) -> Optional[str]: @@ -532,7 +580,7 @@ def select_header_accept(self, accepts: List[str]) -> Optional[str]: return None for accept in accepts: - if re.search("json", accept, re.IGNORECASE): + if re.search('json', accept, re.IGNORECASE): return accept return accepts[0] @@ -547,13 +595,20 @@ def select_header_content_type(self, content_types): return None for content_type in content_types: - if re.search("json", content_type, re.IGNORECASE): + if re.search('json', content_type, re.IGNORECASE): return content_type return content_types[0] def update_params_for_auth( - self, headers, queries, auth_settings, resource_path, method, body, request_auth=None + self, + headers, + queries, + auth_settings, + resource_path, + method, + body, + request_auth=None ) -> None: """Updates header and query params based on authentication setting. 
@@ -571,14 +626,36 @@ def update_params_for_auth( return if request_auth: - self._apply_auth_params(headers, queries, resource_path, method, body, request_auth) + self._apply_auth_params( + headers, + queries, + resource_path, + method, + body, + request_auth + ) else: for auth in auth_settings: auth_setting = self.configuration.auth_settings().get(auth) if auth_setting: - self._apply_auth_params(headers, queries, resource_path, method, body, auth_setting) - - def _apply_auth_params(self, headers, queries, resource_path, method, body, auth_setting) -> None: + self._apply_auth_params( + headers, + queries, + resource_path, + method, + body, + auth_setting + ) + + def _apply_auth_params( + self, + headers, + queries, + resource_path, + method, + body, + auth_setting + ) -> None: """Updates the request parameters based on a single auth_setting :param headers: Header parameters dict to be updated. @@ -589,15 +666,17 @@ def _apply_auth_params(self, headers, queries, resource_path, method, body, auth The object type is the return value of sanitize_for_serialization(). 
:param auth_setting: auth settings for the endpoint """ - if auth_setting["in"] == "cookie": - headers["Cookie"] = auth_setting["value"] - elif auth_setting["in"] == "header": - if auth_setting["type"] != "http-signature": - headers[auth_setting["key"]] = auth_setting["value"] - elif auth_setting["in"] == "query": - queries.append((auth_setting["key"], auth_setting["value"])) + if auth_setting['in'] == 'cookie': + headers['Cookie'] = auth_setting['value'] + elif auth_setting['in'] == 'header': + if auth_setting['type'] != 'http-signature': + headers[auth_setting['key']] = auth_setting['value'] + elif auth_setting['in'] == 'query': + queries.append((auth_setting['key'], auth_setting['value'])) else: - raise ApiValueError("Authentication token must be in `query` or `header`") + raise ApiValueError( + 'Authentication token must be in `query` or `header`' + ) def __deserialize_file(self, response): """Deserializes body to file @@ -617,7 +696,10 @@ def __deserialize_file(self, response): content_disposition = response.getheader("Content-Disposition") if content_disposition: - m = re.search(r'filename=[\'"]?([^\'"\s]+)[\'"]?', content_disposition) + m = re.search( + r'filename=[\'"]?([^\'"\s]+)[\'"]?', + content_disposition + ) assert m is not None, "Unexpected 'content-disposition' header value" filename = m.group(1) path = os.path.join(os.path.dirname(path), filename) @@ -660,7 +742,10 @@ def __deserialize_date(self, string): except ImportError: return string except ValueError: - raise rest.ApiException(status=0, reason="Failed to parse `{0}` as date object".format(string)) + raise rest.ApiException( + status=0, + reason="Failed to parse `{0}` as date object".format(string) + ) def __deserialize_datetime(self, string): """Deserializes string to datetime. 
@@ -675,7 +760,13 @@ def __deserialize_datetime(self, string): except ImportError: return string except ValueError: - raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as datetime object".format(string))) + raise rest.ApiException( + status=0, + reason=( + "Failed to parse `{0}` as datetime object" + .format(string) + ) + ) def __deserialize_enum(self, data, klass): """Deserializes primitive type to enum. @@ -687,7 +778,13 @@ def __deserialize_enum(self, data, klass): try: return klass(data) except ValueError: - raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as `{1}`".format(data, klass))) + raise rest.ApiException( + status=0, + reason=( + "Failed to parse `{0}` as `{1}`" + .format(data, klass) + ) + ) def __deserialize_model(self, data, klass): """Deserializes list or dict to model. diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py index ca801da..9bc7c11 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py @@ -1,14 +1,11 @@ """API response object.""" from __future__ import annotations - -from typing import Generic, Mapping, Optional, TypeVar - -from pydantic import BaseModel, Field, StrictBytes, StrictInt +from typing import Optional, Generic, Mapping, TypeVar +from pydantic import Field, StrictInt, StrictBytes, BaseModel T = TypeVar("T") - class ApiResponse(BaseModel, Generic[T]): """ API response object @@ -19,4 +16,6 @@ class ApiResponse(BaseModel, Generic[T]): data: T = Field(description="Deserialized data given the data type") raw_data: StrictBytes = Field(description="Raw data (HTTP response body)") - model_config = {"arbitrary_types_allowed": True} + model_config = { + "arbitrary_types_allowed": True + } diff --git 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py index de102b2..0b76ea2 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py @@ -1,41 +1,33 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 import copy -import http.client as httplib import logging +from logging import FileHandler import multiprocessing import sys -from logging import FileHandler from typing import Optional - import urllib3 +import http.client as httplib + JSON_SCHEMA_VALIDATION_KEYWORDS = { - "multipleOf", - "maximum", - "exclusiveMaximum", - "minimum", - "exclusiveMinimum", - "maxLength", - "minLength", - "pattern", - "maxItems", - "minItems", + 'multipleOf', 'maximum', 'exclusiveMaximum', + 'minimum', 'exclusiveMinimum', 'maxLength', + 'minLength', 'pattern', 'maxItems', 'minItems' } - class Configuration: """This class contains various settings of the API client. 
@@ -71,25 +63,20 @@ class Configuration: _default = None - def __init__( - self, - host=None, - api_key=None, - api_key_prefix=None, - username=None, - password=None, - access_token=None, - server_index=None, - server_variables=None, - server_operation_index=None, - server_operation_variables=None, - ignore_operation_servers=False, - ssl_ca_cert=None, - retries=None, - *, - debug: Optional[bool] = None - ) -> None: - """Constructor""" + def __init__(self, host=None, + api_key=None, api_key_prefix=None, + username=None, password=None, + access_token=None, + server_index=None, server_variables=None, + server_operation_index=None, server_operation_variables=None, + ignore_operation_servers=False, + ssl_ca_cert=None, + retries=None, + *, + debug: Optional[bool] = None + ) -> None: + """Constructor + """ self._base_path = "http://localhost" if host is None else host """Default Base url """ @@ -135,7 +122,7 @@ def __init__( """ self.logger["package_logger"] = logging.getLogger("admin_api_lib.extractor_api_client.openapi_client") self.logger["urllib3_logger"] = logging.getLogger("urllib3") - self.logger_format = "%(asctime)s %(levelname)s %(message)s" + self.logger_format = '%(asctime)s %(levelname)s %(message)s' """Log format """ self.logger_stream_handler = None @@ -190,7 +177,7 @@ def __init__( self.proxy_headers = None """Proxy headers """ - self.safe_chars_for_path_param = "" + self.safe_chars_for_path_param = '' """Safe chars for path_param """ self.retries = retries @@ -216,7 +203,7 @@ def __deepcopy__(self, memo): result = cls.__new__(cls) memo[id(self)] = result for k, v in self.__dict__.items(): - if k not in ("logger", "logger_file_handler"): + if k not in ('logger', 'logger_file_handler'): setattr(result, k, copy.deepcopy(v, memo)) # shallow copy of loggers result.logger = copy.copy(self.logger) @@ -376,7 +363,9 @@ def get_basic_auth_token(self): password = "" if self.password is not None: password = self.password - return 
urllib3.util.make_headers(basic_auth=username + ":" + password).get("authorization") + return urllib3.util.make_headers( + basic_auth=username + ':' + password + ).get('authorization') def auth_settings(self): """Gets Auth Settings dict for api client. @@ -391,13 +380,12 @@ def to_debug_report(self): :return: The report for debugging. """ - return ( - "Python SDK Debug Report:\n" - "OS: {env}\n" - "Python Version: {pyversion}\n" - "Version of the API: 1.0.0\n" - "SDK Package Version: 1.0.0".format(env=sys.platform, pyversion=sys.version) - ) + return "Python SDK Debug Report:\n"\ + "OS: {env}\n"\ + "Python Version: {pyversion}\n"\ + "Version of the API: 1.0.0\n"\ + "SDK Package Version: 1.0.0".\ + format(env=sys.platform, pyversion=sys.version) def get_host_settings(self): """Gets an array of host settings @@ -406,8 +394,8 @@ def get_host_settings(self): """ return [ { - "url": "", - "description": "No description provided", + 'url': "", + 'description': "No description provided", } ] @@ -429,20 +417,22 @@ def get_host_from_settings(self, index, variables=None, servers=None): except IndexError: raise ValueError( "Invalid index {0} when selecting the host settings. " - "Must be less than {1}".format(index, len(servers)) - ) + "Must be less than {1}".format(index, len(servers))) - url = server["url"] + url = server['url'] # go through variables and replace placeholders - for variable_name, variable in server.get("variables", {}).items(): - used_value = variables.get(variable_name, variable["default_value"]) + for variable_name, variable in server.get('variables', {}).items(): + used_value = variables.get( + variable_name, variable['default_value']) - if "enum_values" in variable and used_value not in variable["enum_values"]: + if 'enum_values' in variable \ + and used_value not in variable['enum_values']: raise ValueError( "The variable `{0}` in the host URL has invalid value " - "{1}. 
Must be {2}.".format(variable_name, variables[variable_name], variable["enum_values"]) - ) + "{1}. Must be {2}.".format( + variable_name, variables[variable_name], + variable['enum_values'])) url = url.replace("{" + variable_name + "}", used_value) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py index 877d8be..a5adf00 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py @@ -1,28 +1,27 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from typing import Any, Optional - from typing_extensions import Self - class OpenApiException(Exception): """The base exception class for all OpenAPIExceptions""" class ApiTypeError(OpenApiException, TypeError): - def __init__(self, msg, path_to_item=None, valid_classes=None, key_type=None) -> None: - """Raises an exception for TypeErrors + def __init__(self, msg, path_to_item=None, valid_classes=None, + key_type=None) -> None: + """ Raises an exception for TypeErrors Args: msg (str): the exception message @@ -103,10 +102,11 @@ def __init__(self, msg, path_to_item=None) -> None: class ApiException(OpenApiException): + def __init__( - self, - status=None, - reason=None, + self, + status=None, + reason=None, http_resp=None, *, body: Optional[str] = None, @@ -125,17 +125,17 @@ def __init__( self.reason = http_resp.reason if self.body is None: try: - self.body = http_resp.data.decode("utf-8") + self.body = http_resp.data.decode('utf-8') except Exception: pass self.headers = http_resp.getheaders() @classmethod def from_response( - cls, - *, - http_resp, - body: Optional[str], + cls, + *, + http_resp, + body: Optional[str], data: Optional[Any], ) -> Self: if http_resp.status == 400: @@ -156,9 +156,11 @@ def from_response( def __str__(self): """Custom error messages for exception""" - error_message = "({0})\n" "Reason: {1}\n".format(self.status, self.reason) + error_message = "({0})\n"\ + "Reason: {1}\n".format(self.status, self.reason) if self.headers: - error_message += "HTTP response headers: {0}\n".format(self.headers) + error_message += "HTTP response headers: {0}\n".format( + self.headers) if self.data or self.body: error_message += "HTTP response body: {0}\n".format(self.data or self.body) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py index 4301aed..022896f 100644 --- 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py @@ -2,30 +2,19 @@ # flake8: noqa """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 # import models into model package -from admin_api_lib.extractor_api_client.openapi_client.models.confluence_parameters import ( - ConfluenceParameters, -) -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ( - ContentType, -) -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ( - ExtractionRequest, -) -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import ( - InformationPiece, -) -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import ( - KeyValuePair, -) +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py index c659e69..b797b12 
100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py @@ -1,36 +1,38 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - import json from enum import Enum - from typing_extensions import Self class ContentType(str, Enum): - """ """ + """ + + """ """ allowed enum values """ - IMAGE = "IMAGE" - TABLE = "TABLE" - TEXT = "TEXT" + IMAGE = 'IMAGE' + TABLE = 'TABLE' + TEXT = 'TEXT' @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ContentType from a JSON string""" return cls(json.loads(json_str)) + + diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py index 393ba17..db65003 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py @@ -1,33 +1,36 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of 
the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional, Set +import json -from pydantic import BaseModel, ConfigDict, StrictStr +from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr +from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair +from typing import Optional, Set from typing_extensions import Self - class ExtractionRequest(BaseModel): - """ """ # noqa: E501 - - path_on_s3: StrictStr - __properties: ClassVar[List[str]] = ["path_on_s3"] + """ + + """ # noqa: E501 + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None + type: StrictStr + kwargs: Optional[List[KeyValuePair]] = None + __properties: ClassVar[List[str]] = ["file", "type", "kwargs"] model_config = ConfigDict( populate_by_name=True, @@ -35,13 +38,15 @@ class ExtractionRequest(BaseModel): protected_namespaces=(), ) + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Optional[Self]: @@ -58,13 +63,21 @@ def to_dict(self) -> Dict[str, Any]: were set at model initialization. Other fields with value `None` are ignored. 
""" - excluded_fields: Set[str] = set([]) + excluded_fields: Set[str] = set([ + ]) _dict = self.model_dump( by_alias=True, exclude=excluded_fields, exclude_none=True, ) + # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) + _items = [] + if self.kwargs: + for _item_kwargs in self.kwargs: + if _item_kwargs: + _items.append(_item_kwargs.to_dict()) + _dict['kwargs'] = _items return _dict @classmethod @@ -76,5 +89,11 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({"path_on_s3": obj.get("path_on_s3")}) + _obj = cls.model_validate({ + "file": obj.get("file"), + "type": obj.get("type"), + "kwargs": [KeyValuePair.from_dict(_item) for _item in obj["kwargs"]] if obj.get("kwargs") is not None else None + }) return _obj + + diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py index a6d6c08..95a0fdb 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py @@ -1,40 +1,33 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional, Set +import json from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair +from typing import Optional, Set from typing_extensions import Self -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ( - ContentType, -) -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import ( - KeyValuePair, -) - - class InformationPiece(BaseModel): """ A piece of information that has been extracted. - """ # noqa: E501 - + """ # noqa: E501 metadata: List[KeyValuePair] page_content: StrictStr type: ContentType @@ -46,13 +39,15 @@ class InformationPiece(BaseModel): protected_namespaces=(), ) + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Optional[Self]: @@ -69,7 +64,8 @@ def to_dict(self) -> Dict[str, Any]: were set at model initialization. Other fields with value `None` are ignored. 
""" - excluded_fields: Set[str] = set([]) + excluded_fields: Set[str] = set([ + ]) _dict = self.model_dump( by_alias=True, @@ -82,7 +78,7 @@ def to_dict(self) -> Dict[str, Any]: for _item_metadata in self.metadata: if _item_metadata: _items.append(_item_metadata.to_dict()) - _dict["metadata"] = _items + _dict['metadata'] = _items return _dict @classmethod @@ -94,15 +90,11 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "metadata": ( - [KeyValuePair.from_dict(_item) for _item in obj["metadata"]] - if obj.get("metadata") is not None - else None - ), - "page_content": obj.get("page_content"), - "type": obj.get("type"), - } - ) + _obj = cls.model_validate({ + "metadata": [KeyValuePair.from_dict(_item) for _item in obj["metadata"]] if obj.get("metadata") is not None else None, + "page_content": obj.get("page_content"), + "type": obj.get("type") + }) return _obj + + diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py index 80629a9..553288b 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py @@ -1,31 +1,31 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. 
+ Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional, Set +import json from pydantic import BaseModel, ConfigDict +from typing import Any, ClassVar, Dict, List, Optional +from typing import Optional, Set from typing_extensions import Self - class KeyValuePair(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 key: Optional[Any] = None value: Optional[Any] = None __properties: ClassVar[List[str]] = ["key", "value"] @@ -36,13 +36,15 @@ class KeyValuePair(BaseModel): protected_namespaces=(), ) + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Optional[Self]: @@ -59,7 +61,8 @@ def to_dict(self) -> Dict[str, Any]: were set at model initialization. Other fields with value `None` are ignored. 
""" - excluded_fields: Set[str] = set([]) + excluded_fields: Set[str] = set([ + ]) _dict = self.model_dump( by_alias=True, @@ -69,12 +72,12 @@ def to_dict(self) -> Dict[str, Any]: # set to None if key (nullable) is None # and model_fields_set contains the field if self.key is None and "key" in self.model_fields_set: - _dict["key"] = None + _dict['key'] = None # set to None if value (nullable) is None # and model_fields_set contains the field if self.value is None and "value" in self.model_fields_set: - _dict["value"] = None + _dict['value'] = None return _dict @@ -87,5 +90,10 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({"key": obj.get("key"), "value": obj.get("value")}) + _obj = cls.model_validate({ + "key": obj.get("key"), + "value": obj.get("value") + }) return _obj + + diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py index 09f1e39..32b1c3a 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py @@ -1,14 +1,14 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 @@ -19,10 +19,7 @@ import urllib3 -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ( - ApiException, - ApiValueError, -) +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiException, ApiValueError SUPPORTED_SOCKS_PROXIES = {"socks5", "socks5h", "socks4", "socks4a"} RESTResponseType = urllib3.HTTPResponse @@ -39,6 +36,7 @@ def is_socks_proxy_url(url): class RESTResponse(io.IOBase): + def __init__(self, resp) -> None: self.response = resp self.status = resp.status @@ -60,6 +58,7 @@ def getheader(self, name, default=None): class RESTClientObject: + def __init__(self, configuration) -> None: # urllib3.PoolManager will pass all kw parameters to connectionpool # https://github.com/shazow/urllib3/blob/f9409436f83aeb79fbaf090181cd81b784f1b8ce/urllib3/poolmanager.py#L75 # noqa: E501 @@ -79,19 +78,22 @@ def __init__(self, configuration) -> None: "key_file": configuration.key_file, } if configuration.assert_hostname is not None: - pool_args["assert_hostname"] = configuration.assert_hostname + pool_args['assert_hostname'] = ( + configuration.assert_hostname + ) if configuration.retries is not None: - pool_args["retries"] = configuration.retries + pool_args['retries'] = configuration.retries if configuration.tls_server_name: - pool_args["server_hostname"] = configuration.tls_server_name + pool_args['server_hostname'] = configuration.tls_server_name + if configuration.socket_options is not None: - pool_args["socket_options"] = configuration.socket_options + pool_args['socket_options'] = configuration.socket_options if configuration.connection_pool_maxsize is not None: - pool_args["maxsize"] = configuration.connection_pool_maxsize + pool_args['maxsize'] = configuration.connection_pool_maxsize # https pool manager self.pool_manager: urllib3.PoolManager @@ -99,7 +101,6 @@ def __init__(self, configuration) -> None: if configuration.proxy: if is_socks_proxy_url(configuration.proxy): from urllib3.contrib.socks import 
SOCKSProxyManager - pool_args["proxy_url"] = configuration.proxy pool_args["headers"] = configuration.proxy_headers self.pool_manager = SOCKSProxyManager(**pool_args) @@ -110,7 +111,15 @@ def __init__(self, configuration) -> None: else: self.pool_manager = urllib3.PoolManager(**pool_args) - def request(self, method, url, headers=None, body=None, post_params=None, _request_timeout=None): + def request( + self, + method, + url, + headers=None, + body=None, + post_params=None, + _request_timeout=None + ): """Perform requests. :param method: http request method @@ -126,10 +135,20 @@ def request(self, method, url, headers=None, body=None, post_params=None, _reque (connection, read) timeouts. """ method = method.upper() - assert method in ["GET", "HEAD", "DELETE", "POST", "PUT", "PATCH", "OPTIONS"] + assert method in [ + 'GET', + 'HEAD', + 'DELETE', + 'POST', + 'PUT', + 'PATCH', + 'OPTIONS' + ] if post_params and body: - raise ApiValueError("body parameter cannot be used with post_params parameter.") + raise ApiValueError( + "body parameter cannot be used with post_params parameter." 
+ ) post_params = post_params or {} headers = headers or {} @@ -138,22 +157,37 @@ def request(self, method, url, headers=None, body=None, post_params=None, _reque if _request_timeout: if isinstance(_request_timeout, (int, float)): timeout = urllib3.Timeout(total=_request_timeout) - elif isinstance(_request_timeout, tuple) and len(_request_timeout) == 2: - timeout = urllib3.Timeout(connect=_request_timeout[0], read=_request_timeout[1]) + elif ( + isinstance(_request_timeout, tuple) + and len(_request_timeout) == 2 + ): + timeout = urllib3.Timeout( + connect=_request_timeout[0], + read=_request_timeout[1] + ) try: # For `POST`, `PUT`, `PATCH`, `OPTIONS`, `DELETE` - if method in ["POST", "PUT", "PATCH", "OPTIONS", "DELETE"]: + if method in ['POST', 'PUT', 'PATCH', 'OPTIONS', 'DELETE']: + # no content type provided or payload is json - content_type = headers.get("Content-Type") - if not content_type or re.search("json", content_type, re.IGNORECASE): + content_type = headers.get('Content-Type') + if ( + not content_type + or re.search('json', content_type, re.IGNORECASE) + ): request_body = None if body is not None: request_body = json.dumps(body) r = self.pool_manager.request( - method, url, body=request_body, timeout=timeout, headers=headers, preload_content=False + method, + url, + body=request_body, + timeout=timeout, + headers=headers, + preload_content=False ) - elif content_type == "application/x-www-form-urlencoded": + elif content_type == 'application/x-www-form-urlencoded': r = self.pool_manager.request( method, url, @@ -161,15 +195,15 @@ def request(self, method, url, headers=None, body=None, post_params=None, _reque encode_multipart=False, timeout=timeout, headers=headers, - preload_content=False, + preload_content=False ) - elif content_type == "multipart/form-data": + elif content_type == 'multipart/form-data': # must del headers['Content-Type'], or the correct # Content-Type which generated by urllib3 will be # overwritten. 
- del headers["Content-Type"] + del headers['Content-Type'] # Ensures that dict objects are serialized - post_params = [(a, json.dumps(b)) if isinstance(b, dict) else (a, b) for a, b in post_params] + post_params = [(a, json.dumps(b)) if isinstance(b, dict) else (a,b) for a, b in post_params] r = self.pool_manager.request( method, url, @@ -177,20 +211,29 @@ def request(self, method, url, headers=None, body=None, post_params=None, _reque encode_multipart=True, timeout=timeout, headers=headers, - preload_content=False, + preload_content=False ) # Pass a `string` parameter directly in the body to support # other content types than JSON when `body` argument is # provided in serialized form. elif isinstance(body, str) or isinstance(body, bytes): r = self.pool_manager.request( - method, url, body=body, timeout=timeout, headers=headers, preload_content=False + method, + url, + body=body, + timeout=timeout, + headers=headers, + preload_content=False ) - elif headers["Content-Type"].startswith("text/") and isinstance(body, bool): + elif headers['Content-Type'].startswith('text/') and isinstance(body, bool): request_body = "true" if body else "false" r = self.pool_manager.request( - method, url, body=request_body, preload_content=False, timeout=timeout, headers=headers - ) + method, + url, + body=request_body, + preload_content=False, + timeout=timeout, + headers=headers) else: # Cannot generate the request from given parameters msg = """Cannot prepare a request message for provided @@ -200,7 +243,12 @@ def request(self, method, url, headers=None, body=None, post_params=None, _reque # For `GET`, `HEAD` else: r = self.pool_manager.request( - method, url, fields={}, timeout=timeout, headers=headers, preload_content=False + method, + url, + fields={}, + timeout=timeout, + headers=headers, + preload_content=False ) except urllib3.exceptions.SSLError as e: msg = "\n".join([type(e).__name__, str(e)]) diff --git 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py new file mode 100644 index 0000000..9704fc8 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py @@ -0,0 +1,33 @@ +# coding: utf-8 + +""" + extractor-api-lib + + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) + + Do not edit the class manually. +""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType + +class TestContentType(unittest.TestCase): + """ContentType unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def testContentType(self): + """Test ContentType""" + # inst = ContentType() + +if __name__ == '__main__': + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py new file mode 100644 index 0000000..fd48e16 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py @@ -0,0 +1,56 @@ +# coding: utf-8 + +""" + extractor-api-lib + + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) + + Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest + +class TestExtractionRequest(unittest.TestCase): + """ExtractionRequest unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> ExtractionRequest: + """Test ExtractionRequest + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included """ + # uncomment below to create an instance of `ExtractionRequest` + """ + model = ExtractionRequest() + if include_optional: + return ExtractionRequest( + file = bytes(b'blah'), + type = '', + kwargs = [ + {"value":"value","key":"key"} + ] + ) + else: + return ExtractionRequest( + type = '', + ) + """ + + def testExtractionRequest(self): + """Test ExtractionRequest""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + +if __name__ == '__main__': + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py new file mode 100644 index 0000000..e76b68d --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py @@ -0,0 +1,37 @@ +# coding: utf-8 + +""" + extractor-api-lib + + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) + + Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi + + +class TestExtractorApi(unittest.TestCase): + """ExtractorApi unit test stubs""" + + def setUp(self) -> None: + self.api = ExtractorApi() + + def tearDown(self) -> None: + pass + + def test_extract_from_file_post(self) -> None: + """Test case for extract_from_file_post + + """ + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py new file mode 100644 index 0000000..0661af0 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py @@ -0,0 +1,60 @@ +# coding: utf-8 + +""" + extractor-api-lib + + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) + + Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece + +class TestInformationPiece(unittest.TestCase): + """InformationPiece unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> InformationPiece: + """Test InformationPiece + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included """ + # uncomment below to create an instance of `InformationPiece` + """ + model = InformationPiece() + if include_optional: + return InformationPiece( + metadata = [ + {"value":"value","key":"key"} + ], + page_content = '', + type = 'IMAGE' + ) + else: + return InformationPiece( + metadata = [ + {"value":"value","key":"key"} + ], + page_content = '', + type = 'IMAGE', + ) + """ + + def testInformationPiece(self): + """Test InformationPiece""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + +if __name__ == '__main__': + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py new file mode 100644 index 0000000..695ebb9 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py @@ -0,0 +1,52 @@ +# coding: utf-8 + +""" + extractor-api-lib + + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) + + Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair + +class TestKeyValuePair(unittest.TestCase): + """KeyValuePair unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> KeyValuePair: + """Test KeyValuePair + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included """ + # uncomment below to create an instance of `KeyValuePair` + """ + model = KeyValuePair() + if include_optional: + return KeyValuePair( + key = None, + value = None + ) + else: + return KeyValuePair( + ) + """ + + def testKeyValuePair(self): + """Test KeyValuePair""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + +if __name__ == '__main__': + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/models/document_status.py b/admin-api-lib/src/admin_api_lib/models/document_status.py index e379f85..d00dfce 100644 --- a/admin-api-lib/src/admin_api_lib/models/document_status.py +++ b/admin-api-lib/src/admin_api_lib/models/document_status.py @@ -1,37 +1,37 @@ # coding: utf-8 """ -admin-api-lib + admin-api-lib -The API is used for the communication between the admin frontend and the admin backend in the rag project. + The API is used for the communication between the admin frontend and the admin backend in the rag project. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json -from pydantic import BaseModel, ConfigDict, StrictStr -from admin_api_lib.models.status import Status + +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List +from admin_api_lib.models.status import Status try: from typing import Self except ImportError: from typing_extensions import Self - class DocumentStatus(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 name: StrictStr status: Status __properties: ClassVar[List[str]] = ["name", "status"] @@ -42,13 +42,15 @@ class DocumentStatus(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -67,7 +69,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) return _dict @@ -81,5 +84,10 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({"name": obj.get("name"), "status": obj.get("status")}) + _obj = cls.model_validate({ + "name": obj.get("name"), + "status": obj.get("status") + }) return _obj + + diff --git a/admin-api-lib/src/admin_api_lib/models/extra_models.py b/admin-api-lib/src/admin_api_lib/models/extra_models.py index f0588d2..a3a283f 100644 --- a/admin-api-lib/src/admin_api_lib/models/extra_models.py +++ b/admin-api-lib/src/admin_api_lib/models/extra_models.py @@ -2,7 
+2,6 @@ from pydantic import BaseModel - class TokenModel(BaseModel): """Defines a token model.""" diff --git a/admin-api-lib/src/admin_api_lib/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py new file mode 100644 index 0000000..8419cfa --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py @@ -0,0 +1,102 @@ +# coding: utf-8 + +""" + admin-api-lib + + The API is used for the communication between the admin frontend and the admin backend in the rag project. + + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) + + Do not edit the class manually. +""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + + + + +from pydantic import BaseModel, ConfigDict +from typing import Any, ClassVar, Dict, List, Optional +try: + from typing import Self +except ImportError: + from typing_extensions import Self + +class KeyValuePair(BaseModel): + """ + + """ # noqa: E501 + key: Optional[Any] = None + value: Optional[Any] = None + __properties: ClassVar[List[str]] = ["key", "value"] + + model_config = { + "populate_by_name": True, + "validate_assignment": True, + "protected_namespaces": (), + } + + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Create an instance of KeyValuePair from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. 
+ + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. + """ + _dict = self.model_dump( + by_alias=True, + exclude={ + }, + exclude_none=True, + ) + # set to None if key (nullable) is None + # and model_fields_set contains the field + if self.key is None and "key" in self.model_fields_set: + _dict['key'] = None + + # set to None if value (nullable) is None + # and model_fields_set contains the field + if self.value is None and "value" in self.model_fields_set: + _dict['value'] = None + + return _dict + + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of KeyValuePair from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate({ + "key": obj.get("key"), + "value": obj.get("value") + }) + return _obj + + diff --git a/admin-api-lib/src/admin_api_lib/models/status.py b/admin-api-lib/src/admin_api_lib/models/status.py index 33f8f58..2e0de2c 100644 --- a/admin-api-lib/src/admin_api_lib/models/status.py +++ b/admin-api-lib/src/admin_api_lib/models/status.py @@ -1,24 +1,25 @@ # coding: utf-8 """ -admin-api-lib + admin-api-lib -The API is used for the communication between the admin frontend and the admin backend in the rag project. + The API is used for the communication between the admin frontend and the admin backend in the rag project. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - import json import pprint import re # noqa: F401 from enum import Enum + + try: from typing import Self except ImportError: @@ -26,17 +27,21 @@ class Status(str, Enum): - """ """ + """ + + """ """ allowed enum values """ - UPLOADING = "UPLOADING" - PROCESSING = "PROCESSING" - READY = "READY" - ERROR = "ERROR" + UPLOADING = 'UPLOADING' + PROCESSING = 'PROCESSING' + READY = 'READY' + ERROR = 'ERROR' @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of Status from a JSON string""" return cls(json.loads(json_str)) + + diff --git a/admin-api-lib/src/admin_api_lib/models/upload_source.py b/admin-api-lib/src/admin_api_lib/models/upload_source.py new file mode 100644 index 0000000..f76b987 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/models/upload_source.py @@ -0,0 +1,102 @@ +# coding: utf-8 + +""" + admin-api-lib + + The API is used for the communication between the admin frontend and the admin backend in the rag project. + + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) + + Do not edit the class manually. 
+""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + + + + +from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr +from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union +from admin_api_lib.models.key_value_pair import KeyValuePair +try: + from typing import Self +except ImportError: + from typing_extensions import Self + +class UploadSource(BaseModel): + """ + + """ # noqa: E501 + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None + type: StrictStr + kwargs: Optional[List[KeyValuePair]] = None + __properties: ClassVar[List[str]] = ["file", "type", "kwargs"] + + model_config = { + "populate_by_name": True, + "validate_assignment": True, + "protected_namespaces": (), + } + + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Create an instance of UploadSource from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. 
+ """ + _dict = self.model_dump( + by_alias=True, + exclude={ + }, + exclude_none=True, + ) + # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) + _items = [] + if self.kwargs: + for _item in self.kwargs: + if _item: + _items.append(_item.to_dict()) + _dict['kwargs'] = _items + return _dict + + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of UploadSource from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate({ + "file": obj.get("file"), + "type": obj.get("type"), + "kwargs": [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] if obj.get("kwargs") is not None else None + }) + return _obj + + diff --git a/extractor-api-lib/openapi.yaml b/extractor-api-lib/openapi.yaml index a6aea27..d949eb7 100644 --- a/extractor-api-lib/openapi.yaml +++ b/extractor-api-lib/openapi.yaml @@ -5,7 +5,7 @@ info: servers: - url: / paths: - /extract_from_file: + /extract: post: operationId: extract_from_file_post requestBody: @@ -29,47 +29,8 @@ paths: description: Something somewhere went terribly wrong. 
tags: - extractor - /extract_from_confluence: - post: - operationId: extract_from_confluence_post - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/confluence_parameters' - required: true - responses: - "200": - content: - application/json: - schema: - items: - $ref: '#/components/schemas/information_piece' - type: array - description: ok - "404": - description: not found - "422": - description: unprocessable entity - "500": - description: internal server error - tags: - - extractor components: schemas: - extraction_request: - description: "" - example: - path_on_s3: path on s3 - properties: - path_on_s3: - description: "" - title: PathOnS3 - type: string - required: - - path_on_s3 - title: ExtractionRequest - type: object key_value_pair: description: "" example: @@ -120,54 +81,25 @@ components: - type title: InformationPiece type: object - confluence_parameters: + extraction_request: description: "" properties: - url: - description: url of the confluence space. - title: url - type: string - token: - description: api key to access confluence. - title: token - type: string - space_key: - description: the space key of the confluence pages. - title: space_key + file: + description: "" + format: binary + title: file type: string - include_attachments: - default: false - description: "whether to include file attachments (e.g., images, documents)\ - \ in the parsed content. Default is `false`." - title: include_attachments - type: boolean - keep_markdown_format: - default: true - description: whether to preserve markdown formatting in the output. Default - is `true`. - title: keep_markdown_format - type: boolean - keep_newlines: - default: true - description: whether to retain newline characters in the output for better - readability. Default is `true`. 
- title: keep_newlines - type: boolean - document_name: - description: The name that will be used to store the confluence db in the - key value db and the vectordatabase (metadata.document). - title: document_name + type: + description: "" + title: type type: string - confluence_kwargs: - description: Additional kwargs like verify_ssl + kwargs: + description: "" items: $ref: '#/components/schemas/key_value_pair' - title: confluence_kwargs + title: kwargs type: array required: - - document_name - - space_key - - token - - url - title: confluence_parameters + - type + title: extraction_request type: object diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index 418a666..6246635 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -1,59 +1,42 @@ -"""Module for the Extractor API.""" - # coding: utf-8 -# noqa: D105 +from typing import Dict, List # noqa: F401 import importlib import pkgutil -from typing import List # noqa: F401 - -from fastapi import APIRouter, Body # noqa: F401 -import extractor_api_lib.impl from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi -from extractor_api_lib.models.confluence_parameters import ConfluenceParameters +import openapi_server.impl + +from fastapi import ( # noqa: F401 + APIRouter, + Body, + Cookie, + Depends, + Form, + Header, + HTTPException, + Path, + Query, + Response, + Security, + status, +) + +from extractor_api_lib.models.extra_models import TokenModel # noqa: F401 +from typing import Any, List from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece + router = APIRouter() -ns_pkg = extractor_api_lib.impl +ns_pkg = openapi_server.impl for _, name, _ in pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + "."): importlib.import_module(name) 
@router.post( - "/extract_from_confluence", - responses={ - 200: {"model": List[InformationPiece], "description": "ok"}, - 404: {"description": "not found"}, - 422: {"description": "unprocessable entity"}, - 500: {"description": "internal server error"}, - }, - tags=["extractor"], - response_model_by_alias=True, -) -async def extract_from_confluence_post( - confluence_parameters: ConfluenceParameters = Body(None, description=""), -) -> List[InformationPiece]: - """ - Extract information from a Confluence space. - - Parameters - ---------- - confluence_parameters : ConfluenceParameters - The parameters required to access and extract information from the Confluence space. - - Returns - ------- - List[InformationPiece] - A list of extracted information pieces from the Confluence space. - """ - return await BaseExtractorApi.subclasses[0]().extract_from_confluence_post(confluence_parameters) - - -@router.post( - "/extract_from_file", + "/extract", responses={ 200: {"model": List[InformationPiece], "description": "List of extracted information."}, 422: {"description": "Body is not a valid PDF."}, @@ -65,17 +48,6 @@ async def extract_from_confluence_post( async def extract_from_file_post( extraction_request: ExtractionRequest = Body(None, description=""), ) -> List[InformationPiece]: - """ - Extract information from a file based on the provided extraction request. - - Parameters - ---------- - extraction_request : ExtractionRequest - The request object containing details about the extraction process. - - Returns - ------- - List[InformationPiece] - A list of extracted information pieces. 
- """ + if not BaseExtractorApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseExtractorApi.subclasses[0]().extract_from_file_post(extraction_request) diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py index 8f03f9c..a0b1fb5 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py @@ -1,63 +1,20 @@ -"""Module for the base ExtractorApi interface.""" - # coding: utf-8 -# flake8: noqa: D105 -from typing import ClassVar, List, Tuple # noqa: F401 +from typing import ClassVar, Dict, List, Tuple # noqa: F401 -from extractor_api_lib.models.confluence_parameters import ConfluenceParameters +from typing import Any, List from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece class BaseExtractorApi: - """ - The base ExtractorApi interface. - - Attributes - ---------- - subclasses : ClassVar[Tuple] - A tuple containing all subclasses of BaseExtractorApi. - """ - subclasses: ClassVar[Tuple] = () def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) BaseExtractorApi.subclasses = BaseExtractorApi.subclasses + (cls,) - - async def extract_from_confluence_post( - self, - confluence_parameters: ConfluenceParameters, - ) -> List[InformationPiece]: - """ - Extract information from a Confluence space. - - Parameters - ---------- - confluence_parameters : ConfluenceParameters - The parameters required to access and extract information from the Confluence space. - - Returns - ------- - List[InformationPiece] - A list of extracted information pieces from the Confluence space. 
- """ - async def extract_from_file_post( self, extraction_request: ExtractionRequest, ) -> List[InformationPiece]: - """ - Extract information from a file based on the provided extraction request. - - Parameters - ---------- - extraction_request : ExtractionRequest - The request object containing details about the extraction process. - - Returns - ------- - List[InformationPiece] - A list of extracted information pieces. - """ + ... diff --git a/extractor-api-lib/src/extractor_api_lib/models/content_type.py b/extractor-api-lib/src/extractor_api_lib/models/content_type.py index 4e362d3..195f424 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/content_type.py +++ b/extractor-api-lib/src/extractor_api_lib/models/content_type.py @@ -1,24 +1,25 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - import json import pprint import re # noqa: F401 from enum import Enum + + try: from typing import Self except ImportError: @@ -26,16 +27,20 @@ class ContentType(str, Enum): - """ """ + """ + + """ """ allowed enum values """ - IMAGE = "IMAGE" - TABLE = "TABLE" - TEXT = "TEXT" + IMAGE = 'IMAGE' + TABLE = 'TABLE' + TEXT = 'TEXT' @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ContentType from a JSON string""" return cls(json.loads(json_str)) + + diff --git a/extractor-api-lib/src/extractor_api_lib/models/extra_models.py b/extractor-api-lib/src/extractor_api_lib/models/extra_models.py index f0588d2..a3a283f 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extra_models.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extra_models.py @@ -2,7 +2,6 @@ from pydantic import BaseModel - class TokenModel(BaseModel): """Defines a token model.""" diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py index 3290aa7..437442f 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py @@ -1,37 +1,41 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json + + -from pydantic import BaseModel, ConfigDict, StrictStr +from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr +from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union +from extractor_api_lib.models.key_value_pair import KeyValuePair try: from typing import Self except ImportError: from typing_extensions import Self - class ExtractionRequest(BaseModel): - """ """ # noqa: E501 - - path_on_s3: StrictStr - __properties: ClassVar[List[str]] = ["path_on_s3"] + """ + + """ # noqa: E501 + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None + type: StrictStr + kwargs: Optional[List[KeyValuePair]] = None + __properties: ClassVar[List[str]] = ["file", "type", "kwargs"] model_config = { "populate_by_name": True, @@ -39,13 +43,15 @@ class ExtractionRequest(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -64,9 +70,17 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) + # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) + _items = [] + if self.kwargs: + for _item in self.kwargs: + if _item: + _items.append(_item.to_dict()) + _dict['kwargs'] = _items return _dict @classmethod @@ -78,5 +92,11 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): 
return cls.model_validate(obj) - _obj = cls.model_validate({"path_on_s3": obj.get("path_on_s3")}) + _obj = cls.model_validate({ + "file": obj.get("file"), + "type": obj.get("type"), + "kwargs": [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] if obj.get("kwargs") is not None else None + }) return _obj + + diff --git a/extractor-api-lib/src/extractor_api_lib/models/information_piece.py b/extractor-api-lib/src/extractor_api_lib/models/information_piece.py index 440f7a3..98261ff 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/models/information_piece.py @@ -1,40 +1,38 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json + -from pydantic import BaseModel, ConfigDict, StrictStr + +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List from extractor_api_lib.models.content_type import ContentType from extractor_api_lib.models.key_value_pair import KeyValuePair - try: from typing import Self except ImportError: from typing_extensions import Self - class InformationPiece(BaseModel): """ A piece of information that has been extracted. 
- """ # noqa: E501 - + """ # noqa: E501 metadata: List[KeyValuePair] page_content: StrictStr type: ContentType @@ -46,13 +44,15 @@ class InformationPiece(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -71,7 +71,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in metadata (list) @@ -80,7 +81,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.metadata: if _item: _items.append(_item.to_dict()) - _dict["metadata"] = _items + _dict['metadata'] = _items return _dict @classmethod @@ -92,15 +93,11 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "metadata": ( - [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] - if obj.get("metadata") is not None - else None - ), - "page_content": obj.get("page_content"), - "type": obj.get("type"), - } - ) + _obj = cls.model_validate({ + "metadata": [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] if obj.get("metadata") is not None else None, + "page_content": obj.get("page_content"), + "type": obj.get("type") + }) return _obj + + diff --git a/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py b/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py index bdc5bb2..0cf865e 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py +++ 
b/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py @@ -1,35 +1,36 @@ # coding: utf-8 """ -extractor-api-lib + extractor-api-lib -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional +import json + + -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict +from typing import Any, ClassVar, Dict, List, Optional try: from typing import Self except ImportError: from typing_extensions import Self - class KeyValuePair(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 key: Optional[Any] = None value: Optional[Any] = None __properties: ClassVar[List[str]] = ["key", "value"] @@ -40,13 +41,15 @@ class KeyValuePair(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -65,18 +68,19 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) # set to None if key (nullable) is None # 
and model_fields_set contains the field if self.key is None and "key" in self.model_fields_set: - _dict["key"] = None + _dict['key'] = None # set to None if value (nullable) is None # and model_fields_set contains the field if self.value is None and "value" in self.model_fields_set: - _dict["value"] = None + _dict['value'] = None return _dict @@ -89,5 +93,10 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({"key": obj.get("key"), "value": obj.get("value")}) + _obj = cls.model_validate({ + "key": obj.get("key"), + "value": obj.get("value") + }) return _obj + + diff --git a/rag-core-api/src/rag_core_api/apis/rag_api.py b/rag-core-api/src/rag_core_api/apis/rag_api.py index dda92db..425f48c 100644 --- a/rag-core-api/src/rag_core_api/apis/rag_api.py +++ b/rag-core-api/src/rag_core_api/apis/rag_api.py @@ -3,16 +3,16 @@ # coding: utf-8 # flake8: noqa: D105 +from typing import Dict, List # noqa: F401 import importlib -import logging import pkgutil -from asyncio import FIRST_COMPLETED, CancelledError, create_task, sleep, wait -from contextlib import suppress -from typing import Any, Awaitable, List # noqa: F401 + +from rag_core_api.apis.rag_api_base import BaseRagApi +import openapi_server.impl from fastapi import ( # noqa: F401 APIRouter, - BackgroundTasks, + BackgroundTasks, Body, Cookie, Depends, @@ -21,7 +21,7 @@ HTTPException, Path, Query, - Request, + Request, Response, Security, status, @@ -29,11 +29,16 @@ import rag_core_api.impl from rag_core_api.apis.rag_api_base import BaseRagApi +from rag_core_api.models.extra_models import TokenModel # noqa: F401 +from pydantic import Field, StrictStr +from typing import Any, List +from typing_extensions import Annotated from rag_core_api.models.chat_request import ChatRequest from rag_core_api.models.chat_response import ChatResponse from rag_core_api.models.delete_request import DeleteRequest from rag_core_api.models.information_piece import 
InformationPiece + logger = logging.getLogger(__name__) router = APIRouter() @@ -52,7 +57,6 @@ async def _disconnected(request: Request) -> None: except CancelledError: break - @router.post( "/chat/{session_id}", responses={ @@ -64,8 +68,8 @@ async def _disconnected(request: Request) -> None: ) async def chat( request: Request, - session_id: str = Path(..., description=""), - chat_request: ChatRequest = Body(None, description="Chat with RAG."), + session_id: StrictStr = Path(..., description=""), + chat_request: Annotated[ChatRequest, Field(description="Chat with RAG.")] = Body(None, description="Chat with RAG."), ) -> ChatResponse | None: """ Asynchronously handles the chat endpoint for the RAG API. @@ -121,7 +125,8 @@ async def chat( tags=["rag"], response_model_by_alias=True, ) -async def evaluate() -> None: +async def evaluate( +) -> None: """ Asynchronously evaluate the RAG. @@ -129,6 +134,8 @@ async def evaluate() -> None: ------- None """ + if not BaseRagApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().evaluate() @@ -160,7 +167,9 @@ async def remove_information_piece( Returns ------- None - """ + """ + if not BaseRagApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().remove_information_piece(delete_request) @@ -191,5 +200,7 @@ async def upload_information_piece( Returns ------- None - """ + """ + if not BaseRagApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().upload_information_piece(information_piece) diff --git a/rag-core-api/src/rag_core_api/apis/rag_api_base.py b/rag-core-api/src/rag_core_api/apis/rag_api_base.py index 615230d..70d1406 100644 --- a/rag-core-api/src/rag_core_api/apis/rag_api_base.py +++ b/rag-core-api/src/rag_core_api/apis/rag_api_base.py @@ -2,9 +2,11 @@ # coding: utf-8 # flake8: noqa: D105 - from typing import ClassVar, Dict, 
List, Tuple # noqa: F401 +from pydantic import Field, StrictStr +from typing import Any, List +from typing_extensions import Annotated from rag_core_api.models.chat_request import ChatRequest from rag_core_api.models.chat_response import ChatResponse from rag_core_api.models.delete_request import DeleteRequest @@ -22,17 +24,15 @@ class BaseRagApi: subclasses : ClassVar[Tuple] A tuple that holds all subclasses of BaseRagApi. """ - subclasses: ClassVar[Tuple] = () def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) BaseRagApi.subclasses = BaseRagApi.subclasses + (cls,) - async def chat( self, - session_id: str, - chat_request: ChatRequest, + session_id: StrictStr, + chat_request: Annotated[ChatRequest, Field(description="Chat with RAG.")], ) -> ChatResponse: """ Asynchronously handles the chat endpoint for the RAG API. @@ -52,6 +52,7 @@ async def chat( The chat response if the chat task completes successfully, otherwise None. """ + async def evaluate( self, ) -> None: @@ -63,6 +64,7 @@ async def evaluate( None """ + async def remove_information_piece( self, delete_request: DeleteRequest, @@ -82,6 +84,7 @@ async def remove_information_piece( None """ + async def upload_information_piece( self, information_piece: List[InformationPiece], diff --git a/rag-core-api/src/rag_core_api/models/chat_history.py b/rag-core-api/src/rag_core_api/models/chat_history.py index 5980dca..71e2e8c 100644 --- a/rag-core-api/src/rag_core_api/models/chat_history.py +++ b/rag-core-api/src/rag_core_api/models/chat_history.py @@ -1,37 +1,37 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json -from pydantic import BaseModel, ConfigDict -from rag_core_api.models.chat_history_message import ChatHistoryMessage + +from pydantic import BaseModel, ConfigDict +from typing import Any, ClassVar, Dict, List +from rag_core_api.models.chat_history_message import ChatHistoryMessage try: from typing import Self except ImportError: from typing_extensions import Self - class ChatHistory(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 messages: List[ChatHistoryMessage] __properties: ClassVar[List[str]] = ["messages"] @@ -41,13 +41,15 @@ class ChatHistory(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -66,7 +68,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in messages (list) @@ -75,7 +78,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.messages: if _item: _items.append(_item.to_dict()) - _dict["messages"] = _items + _dict['messages'] = _items return _dict @classmethod @@ -87,13 +90,9 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "messages": ( - [ChatHistoryMessage.from_dict(_item) for _item in obj.get("messages")] - if obj.get("messages") is not None - else 
None - ) - } - ) + _obj = cls.model_validate({ + "messages": [ChatHistoryMessage.from_dict(_item) for _item in obj.get("messages")] if obj.get("messages") is not None else None + }) return _obj + + diff --git a/rag-core-api/src/rag_core_api/models/chat_history_message.py b/rag-core-api/src/rag_core_api/models/chat_history_message.py index c664092..59da140 100644 --- a/rag-core-api/src/rag_core_api/models/chat_history_message.py +++ b/rag-core-api/src/rag_core_api/models/chat_history_message.py @@ -1,37 +1,37 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json -from pydantic import BaseModel, ConfigDict, StrictStr -from rag_core_api.models.chat_role import ChatRole + +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List +from rag_core_api.models.chat_role import ChatRole try: from typing import Self except ImportError: from typing_extensions import Self - class ChatHistoryMessage(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 role: ChatRole message: StrictStr __properties: ClassVar[List[str]] = ["role", "message"] @@ -42,13 +42,15 @@ class ChatHistoryMessage(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # 
TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -67,7 +69,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) return _dict @@ -81,5 +84,10 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({"role": obj.get("role"), "message": obj.get("message")}) + _obj = cls.model_validate({ + "role": obj.get("role"), + "message": obj.get("message") + }) return _obj + + diff --git a/rag-core-api/src/rag_core_api/models/chat_request.py b/rag-core-api/src/rag_core_api/models/chat_request.py index 1e0b135..9e28631 100644 --- a/rag-core-api/src/rag_core_api/models/chat_request.py +++ b/rag-core-api/src/rag_core_api/models/chat_request.py @@ -1,37 +1,37 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional +import json -from pydantic import BaseModel, ConfigDict, StrictStr -from rag_core_api.models.chat_history import ChatHistory + +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List, Optional +from rag_core_api.models.chat_history import ChatHistory try: from typing import Self except ImportError: from typing_extensions import Self - class ChatRequest(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 history: Optional[ChatHistory] = None message: StrictStr __properties: ClassVar[List[str]] = ["history", "message"] @@ -42,13 +42,15 @@ class ChatRequest(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -67,12 +69,13 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of history if self.history: - _dict["history"] = self.history.to_dict() + _dict['history'] = self.history.to_dict() return _dict @classmethod @@ -84,10 +87,10 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "history": (ChatHistory.from_dict(obj.get("history")) if obj.get("history") is not None else None), - "message": obj.get("message"), - } - ) + _obj = cls.model_validate({ + 
"history": ChatHistory.from_dict(obj.get("history")) if obj.get("history") is not None else None, + "message": obj.get("message") + }) return _obj + + diff --git a/rag-core-api/src/rag_core_api/models/chat_response.py b/rag-core-api/src/rag_core_api/models/chat_response.py index a0fcf44..6a8daad 100644 --- a/rag-core-api/src/rag_core_api/models/chat_response.py +++ b/rag-core-api/src/rag_core_api/models/chat_response.py @@ -1,37 +1,37 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json -from pydantic import BaseModel, ConfigDict, Field, StrictStr -from rag_core_api.models.information_piece import InformationPiece + +from pydantic import BaseModel, ConfigDict, Field, StrictStr +from typing import Any, ClassVar, Dict, List +from rag_core_api.models.information_piece import InformationPiece try: from typing import Self except ImportError: from typing_extensions import Self - class ChatResponse(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 answer: StrictStr finish_reason: StrictStr = Field(description=" ") citations: List[InformationPiece] @@ -43,13 +43,15 @@ class ChatResponse(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use 
.model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -68,7 +70,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in citations (list) @@ -77,7 +80,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.citations: if _item: _items.append(_item.to_dict()) - _dict["citations"] = _items + _dict['citations'] = _items return _dict @classmethod @@ -89,15 +92,11 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "answer": obj.get("answer"), - "finish_reason": obj.get("finish_reason"), - "citations": ( - [SourceDocument.from_dict(_item) for _item in obj.get("citations")] - if obj.get("citations") is not None - else None - ), - } - ) + _obj = cls.model_validate({ + "answer": obj.get("answer"), + "finish_reason": obj.get("finish_reason"), + "citations": [InformationPiece.from_dict(_item) for _item in obj.get("citations")] if obj.get("citations") is not None else None + }) return _obj + + diff --git a/rag-core-api/src/rag_core_api/models/chat_role.py b/rag-core-api/src/rag_core_api/models/chat_role.py index cd2ff17..d0bef70 100644 --- a/rag-core-api/src/rag_core_api/models/chat_role.py +++ b/rag-core-api/src/rag_core_api/models/chat_role.py @@ -1,24 +1,25 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - import json import pprint import re # noqa: F401 from enum import Enum + + try: from typing import Self except ImportError: @@ -26,15 +27,19 @@ class ChatRole(str, Enum): - """ """ + """ + + """ """ allowed enum values """ - USER = "user" - ASSISTANT = "assistant" + USER = 'user' + ASSISTANT = 'assistant' @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ChatRole from a JSON string""" return cls(json.loads(json_str)) + + diff --git a/rag-core-api/src/rag_core_api/models/content_type.py b/rag-core-api/src/rag_core_api/models/content_type.py index 3d39928..df72d7d 100644 --- a/rag-core-api/src/rag_core_api/models/content_type.py +++ b/rag-core-api/src/rag_core_api/models/content_type.py @@ -1,24 +1,25 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - import json import pprint import re # noqa: F401 from enum import Enum + + try: from typing import Self except ImportError: @@ -26,17 +27,21 @@ class ContentType(str, Enum): - """ """ + """ + + """ """ allowed enum values """ - TEXT = "TEXT" - IMAGE = "IMAGE" - TABLE = "TABLE" - SUMMARY = "SUMMARY" + TEXT = 'TEXT' + IMAGE = 'IMAGE' + TABLE = 'TABLE' + SUMMARY = 'SUMMARY' @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ContentType from a JSON string""" return cls(json.loads(json_str)) + + diff --git a/rag-core-api/src/rag_core_api/models/delete_request.py b/rag-core-api/src/rag_core_api/models/delete_request.py index 797dcf2..2c3592c 100644 --- a/rag-core-api/src/rag_core_api/models/delete_request.py +++ b/rag-core-api/src/rag_core_api/models/delete_request.py @@ -1,37 +1,37 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. 
""" # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional +import json -from pydantic import BaseModel, ConfigDict -from rag_core_api.models.key_value_pair import KeyValuePair + +from pydantic import BaseModel, ConfigDict +from typing import Any, ClassVar, Dict, List, Optional +from rag_core_api.models.key_value_pair import KeyValuePair try: from typing import Self except ImportError: from typing_extensions import Self - class DeleteRequest(BaseModel): - """ """ # noqa: E501 - + """ + + """ # noqa: E501 metadata: Optional[List[KeyValuePair]] = None __properties: ClassVar[List[str]] = ["metadata"] @@ -41,13 +41,15 @@ class DeleteRequest(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -66,7 +68,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in metadata (list) @@ -75,7 +78,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.metadata: if _item: _items.append(_item.to_dict()) - _dict["metadata"] = _items + _dict['metadata'] = _items return _dict @classmethod @@ -87,13 +90,9 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "metadata": ( - [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] - if obj.get("metadata") is not None - 
else None - ) - } - ) + _obj = cls.model_validate({ + "metadata": [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] if obj.get("metadata") is not None else None + }) return _obj + + diff --git a/rag-core-api/src/rag_core_api/models/extra_models.py b/rag-core-api/src/rag_core_api/models/extra_models.py index f0588d2..a3a283f 100644 --- a/rag-core-api/src/rag_core_api/models/extra_models.py +++ b/rag-core-api/src/rag_core_api/models/extra_models.py @@ -2,7 +2,6 @@ from pydantic import BaseModel - class TokenModel(BaseModel): """Defines a token model.""" diff --git a/rag-core-api/src/rag_core_api/models/information_piece.py b/rag-core-api/src/rag_core_api/models/information_piece.py index b85092f..28d5115 100644 --- a/rag-core-api/src/rag_core_api/models/information_piece.py +++ b/rag-core-api/src/rag_core_api/models/information_piece.py @@ -1,43 +1,39 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json + -from pydantic import BaseModel, ConfigDict, Field, StrictStr + +from pydantic import BaseModel, ConfigDict, Field, StrictStr +from typing import Any, ClassVar, Dict, List from rag_core_api.models.content_type import ContentType from rag_core_api.models.key_value_pair import KeyValuePair - try: from typing import Self except ImportError: from typing_extensions import Self - class InformationPiece(BaseModel): """ Uploading a json with chunks and metadata. 
- """ # noqa: E501 - - metadata: List[KeyValuePair] = Field( - description="The metadata of the documents that are stored in the vectordatabase." - ) + """ # noqa: E501 + metadata: List[KeyValuePair] = Field(description="The metadata of the documents that are stored in the vectordatabase.") page_content: StrictStr = Field(description="The content of the document") type: ContentType __properties: ClassVar[List[str]] = ["metadata", "page_content", "type"] @@ -48,13 +44,15 @@ class InformationPiece(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -73,7 +71,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in metadata (list) @@ -82,7 +81,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.metadata: if _item: _items.append(_item.to_dict()) - _dict["metadata"] = _items + _dict['metadata'] = _items return _dict @classmethod @@ -94,15 +93,11 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "metadata": ( - [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] - if obj.get("metadata") is not None - else None - ), - "page_content": obj.get("page_content"), - "type": obj.get("type"), - } - ) + _obj = cls.model_validate({ + "metadata": [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] if obj.get("metadata") is not None else 
None, + "page_content": obj.get("page_content"), + "type": obj.get("type") + }) return _obj + + diff --git a/rag-core-api/src/rag_core_api/models/key_value_pair.py b/rag-core-api/src/rag_core_api/models/key_value_pair.py index abf0986..b9654c3 100644 --- a/rag-core-api/src/rag_core_api/models/key_value_pair.py +++ b/rag-core-api/src/rag_core_api/models/key_value_pair.py @@ -1,37 +1,36 @@ # coding: utf-8 """ -RAG SIT x Stackit + RAG SIT x Stackit -The perfect rag solution. + The perfect rag solution. -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) + The version of the OpenAPI document: 1.0.0 + Generated by OpenAPI Generator (https://openapi-generator.tech) -Do not edit the class manually. + Do not edit the class manually. """ # noqa: E501 from __future__ import annotations - -import json import pprint import re # noqa: F401 -from typing import Any, ClassVar, Dict, List +import json + + -from pydantic import BaseModel, ConfigDict, Field, StrictStr +from pydantic import BaseModel, ConfigDict, Field, StrictStr +from typing import Any, ClassVar, Dict, List try: from typing import Self except ImportError: from typing_extensions import Self - class KeyValuePair(BaseModel): """ The key value pair. 
- """ # noqa: E501 - + """ # noqa: E501 key: StrictStr value: StrictStr = Field(description=" ") __properties: ClassVar[List[str]] = ["key", "value"] @@ -42,13 +41,15 @@ class KeyValuePair(BaseModel): "protected_namespaces": (), } + def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: @@ -67,7 +68,8 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={}, + exclude={ + }, exclude_none=True, ) return _dict @@ -81,5 +83,10 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({"key": obj.get("key"), "value": obj.get("value")}) + _obj = cls.model_validate({ + "key": obj.get("key"), + "value": obj.get("value") + }) return _obj + + From cae32ed28faa338ae9d9917e95144dfda5e787cf Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 9 May 2025 14:46:02 +0200 Subject: [PATCH 02/56] api change --- admin-api-lib/openapi.yaml | 14 +- .../api_endpoints/source_uploader.py | 14 + .../src/admin_api_lib/apis/admin_api.py | 22 +- .../src/admin_api_lib/apis/admin_api_base.py | 24 +- .../openapi_client/__init__.py | 11 +- .../openapi_client/api/__init__.py | 1 - .../openapi_client/api/extractor_api.py | 211 +++++++------ .../openapi_client/api_client.py | 292 ++++++------------ .../openapi_client/api_response.py | 5 +- .../openapi_client/configuration.py | 102 +++--- .../openapi_client/exceptions.py | 38 ++- .../openapi_client/models/__init__.py | 11 +- .../openapi_client/models/content_type.py | 22 +- .../models/extraction_request.py | 40 +-- 
.../models/information_piece.py | 38 ++- .../openapi_client/models/key_value_pair.py | 31 +- .../openapi_client/rest.py | 110 ++----- .../openapi_client/test/test_content_type.py | 14 +- .../test/test_extraction_request.py | 20 +- .../openapi_client/test/test_extractor_api.py | 16 +- .../test/test_information_piece.py | 20 +- .../test/test_key_value_pair.py | 20 +- .../admin_api_lib/models/document_status.py | 30 +- .../src/admin_api_lib/models/extra_models.py | 1 + .../admin_api_lib/models/key_value_pair.py | 34 +- .../src/admin_api_lib/models/status.py | 25 +- .../src/admin_api_lib/models/upload_source.py | 43 +-- extractor-api-lib/openapi.yaml | 14 +- .../extractor_api_lib/apis/extractor_api.py | 17 +- .../apis/extractor_api_base.py | 17 +- .../extractor_api_lib/models/content_type.py | 23 +- .../extractor_api_lib/models/extra_models.py | 1 + .../models/extraction_request.py | 43 +-- .../models/information_piece.py | 41 +-- .../models/key_value_pair.py | 34 +- rag-core-api/src/rag_core_api/apis/rag_api.py | 16 +- .../src/rag_core_api/apis/rag_api_base.py | 5 +- .../src/rag_core_api/models/chat_history.py | 39 +-- .../models/chat_history_message.py | 30 +- .../src/rag_core_api/models/chat_request.py | 37 +-- .../src/rag_core_api/models/chat_response.py | 43 +-- .../src/rag_core_api/models/chat_role.py | 21 +- .../src/rag_core_api/models/content_type.py | 25 +- .../src/rag_core_api/models/delete_request.py | 39 +-- .../src/rag_core_api/models/extra_models.py | 1 + .../rag_core_api/models/information_piece.py | 45 +-- .../src/rag_core_api/models/key_value_pair.py | 28 +- 47 files changed, 782 insertions(+), 946 deletions(-) create mode 100644 admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py diff --git a/admin-api-lib/openapi.yaml b/admin-api-lib/openapi.yaml index 1b8255a..efbb2f6 100644 --- a/admin-api-lib/openapi.yaml +++ b/admin-api-lib/openapi.yaml @@ -83,14 +83,14 @@ paths: - admin /upload_source: post: - description: Uploads user selected pdf 
documents.
+      description: Uploads user selected sources.
       operationId: upload_source
       requestBody:
         content:
-          application/pdf:
+          multipart/form-data:
             schema:
               $ref: '#/components/schemas/upload_source'
-        description: The PDF document to upload.
+        description: The source to upload.
         required: true
       responses:
         "200":
@@ -137,21 +137,21 @@ components:
       file:
         description: ""
         format: binary
-        title: file
         type: string
       type:
         description: ""
-        title: type
         type: string
       kwargs:
         description: ""
         items:
           $ref: '#/components/schemas/key_value_pair'
-        title: kwargs
         type: array
+      name:
+        description: ""
+        type: string
       required:
+      - name
       - type
-      title: upload_source
       type: object
     key_value_pair:
       description: ""
diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py
new file mode 100644
index 0000000..2cfbf2f
--- /dev/null
+++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py
@@ -0,0 +1,14 @@
+from pydantic import Field
+from typing_extensions import Annotated
+from abc import ABC, abstractmethod
+
+from admin_api_lib.models.upload_source import UploadSource
+
+
+class SourceUploader(ABC):
+
+    @abstractmethod
+    async def upload_source(
+        self,
+        upload_source: Annotated[UploadSource, Field(description="The source to upload.")],
+    ) -> None: ...
diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 622cd5a..81d55f5 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -1,5 +1,3 @@ -"""Module for the Admin API.""" - # coding: utf-8 from typing import Dict, List # noqa: F401 @@ -28,10 +26,10 @@ from admin_api_lib.models.extra_models import TokenModel # noqa: F401 from pydantic import Field, StrictBytes, StrictStr -from typing import Any, List, Tuple, Union +from typing import Any, List, Optional, Tuple, Union from typing_extensions import Annotated from admin_api_lib.models.document_status import DocumentStatus -from admin_api_lib.models.upload_source import UploadSource +from admin_api_lib.models.key_value_pair import KeyValuePair router = APIRouter() @@ -101,17 +99,16 @@ async def document_reference_id_get( raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().document_reference_id_get(identification) - @router.get( "/all_documents_status", responses={ - 200: {"model": list[DocumentStatus], "description": "list of document links"}, + 200: {"model": List[DocumentStatus], "description": "List of document links"}, 500: {"description": "Internal server error"}, }, tags=["admin"], response_model_by_alias=True, ) -async def get_all_documents_status() -> list[DocumentStatus]: +async def get_all_documents_status() -> List[DocumentStatus]: """ Asynchronously retrieves the status of all documents. @@ -119,7 +116,7 @@ async def get_all_documents_status() -> list[DocumentStatus]: ------- list[DocumentStatus] A list containing the status of all documents. 
- """ + """ if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().get_all_documents_status() @@ -137,9 +134,12 @@ async def get_all_documents_status() -> list[DocumentStatus]: response_model_by_alias=True, ) async def upload_source( - upload_source: Annotated[UploadSource, Field(description="The source to upload.")] = Body(None, description="The source to upload."), + type: StrictStr = Form(None, description=""), + name: StrictStr = Form(None, description=""), + file: Optional[UploadFile] = Form(None, description=""), + kwargs: Optional[List[KeyValuePair]] = Form(None, description=""), ) -> None: - """Uploads user selected source.""" + """Uploads user selected sources.""" if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().upload_source(upload_source) + return await BaseAdminApi.subclasses[0]().upload_source(type, name, file, kwargs) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index efeb120..34bce77 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -3,24 +3,14 @@ from typing import ClassVar, Dict, List, Tuple # noqa: F401 from pydantic import Field, StrictBytes, StrictStr -from typing import Any, List, Tuple, Union +from typing import Any, List, Optional, Tuple, Union from typing_extensions import Annotated from fastapi import Request, Response, UploadFile - from admin_api_lib.models.document_status import DocumentStatus -from admin_api_lib.models.upload_source import UploadSource +from admin_api_lib.models.key_value_pair import KeyValuePair class BaseAdminApi: - """ - The base AdminApi interface. - - Attributes - ---------- - subclasses : ClassVar[Tuple] - A tuple that holds all subclasses of BaseAdminApi. 
- """ - subclasses: ClassVar[Tuple] = () def __init_subclass__(cls, **kwargs): @@ -44,10 +34,9 @@ async def delete_document( None """ - async def document_reference_id_get( self, - identification: str, + identification: Annotated[StrictStr, Field(description="Identifier of the pdf document.")], ) -> Response: """ Asynchronously retrieve a document reference by its identification. @@ -76,9 +65,12 @@ async def get_all_documents_status( A list containing the status of all documents. """ - async def upload_source( self, - upload_source: Annotated[UploadSource, Field(description="The PDF document to upload.")], + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[List[KeyValuePair]], ) -> None: + """Uploads user selected sources.""" ... diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py index ae86262..f43e4e9 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py @@ -3,14 +3,14 @@ # flake8: noqa """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -32,6 +32,5 @@ # import models into sdk package from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py index 792725e..c95ce65 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py @@ -2,4 +2,3 @@ # import apis into api package from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi - diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py index e4a0fa6..1a862d3 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 import warnings @@ -16,9 +16,10 @@ from typing import Any, Dict, List, Optional, Tuple, Union from typing_extensions import Annotated -from typing import List -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from pydantic import StrictBytes, StrictStr +from typing import List, Optional, Tuple, Union from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient, RequestSerialized from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse @@ -37,29 +38,34 @@ def __init__(self, api_client=None) -> None: api_client = ApiClient.get_default() self.api_client = api_client - @validate_call - def extract_from_file_post( + def extract( self, - extraction_request: ExtractionRequest, + type: StrictStr, + name: StrictStr, + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None, + kwargs: Optional[List[KeyValuePair]] = None, _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], - Tuple[ - Annotated[StrictFloat, Field(gt=0)], - Annotated[StrictFloat, Field(gt=0)] - ] + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], ] = None, _request_auth: Optional[Dict[StrictStr, Any]] = None, _content_type: Optional[StrictStr] = None, _headers: Optional[Dict[StrictStr, Any]] = None, _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, ) -> List[InformationPiece]: - """extract_from_file_post + """extract - :param extraction_request: (required) - :type extraction_request: ExtractionRequest + :param type: (required) + :type type: str + :param name: (required) + :type name: str + :param file: + :type file: bytearray + :param kwargs: + :type kwargs: List[KeyValuePair] :param _request_timeout: 
timeout setting for this request. If one number provided, it will be total request timeout. It can also be a pair (tuple) of @@ -80,54 +86,59 @@ def extract_from_file_post( in the spec for a single request. :type _host_index: int, optional :return: Returns the result object. - """ # noqa: E501 + """ # noqa: E501 - _param = self._extract_from_file_post_serialize( - extraction_request=extraction_request, + _param = self._extract_serialize( + type=type, + name=name, + file=file, + kwargs=kwargs, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, - _host_index=_host_index + _host_index=_host_index, ) _response_types_map: Dict[str, Optional[str]] = { - '200': "List[InformationPiece]", - '422': None, - '500': None, + "200": "List[InformationPiece]", + "422": None, + "500": None, } - response_data = self.api_client.call_api( - *_param, - _request_timeout=_request_timeout - ) + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) response_data.read() return self.api_client.response_deserialize( response_data=response_data, response_types_map=_response_types_map, ).data - @validate_call - def extract_from_file_post_with_http_info( + def extract_with_http_info( self, - extraction_request: ExtractionRequest, + type: StrictStr, + name: StrictStr, + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None, + kwargs: Optional[List[KeyValuePair]] = None, _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], - Tuple[ - Annotated[StrictFloat, Field(gt=0)], - Annotated[StrictFloat, Field(gt=0)] - ] + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], ] = None, _request_auth: Optional[Dict[StrictStr, Any]] = None, _content_type: Optional[StrictStr] = None, _headers: Optional[Dict[StrictStr, Any]] = None, _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, ) -> ApiResponse[List[InformationPiece]]: - """extract_from_file_post + """extract - :param 
extraction_request: (required) - :type extraction_request: ExtractionRequest + :param type: (required) + :type type: str + :param name: (required) + :type name: str + :param file: + :type file: bytearray + :param kwargs: + :type kwargs: List[KeyValuePair] :param _request_timeout: timeout setting for this request. If one number provided, it will be total request timeout. It can also be a pair (tuple) of @@ -148,54 +159,59 @@ def extract_from_file_post_with_http_info( in the spec for a single request. :type _host_index: int, optional :return: Returns the result object. - """ # noqa: E501 + """ # noqa: E501 - _param = self._extract_from_file_post_serialize( - extraction_request=extraction_request, + _param = self._extract_serialize( + type=type, + name=name, + file=file, + kwargs=kwargs, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, - _host_index=_host_index + _host_index=_host_index, ) _response_types_map: Dict[str, Optional[str]] = { - '200': "List[InformationPiece]", - '422': None, - '500': None, + "200": "List[InformationPiece]", + "422": None, + "500": None, } - response_data = self.api_client.call_api( - *_param, - _request_timeout=_request_timeout - ) + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) response_data.read() return self.api_client.response_deserialize( response_data=response_data, response_types_map=_response_types_map, ) - @validate_call - def extract_from_file_post_without_preload_content( + def extract_without_preload_content( self, - extraction_request: ExtractionRequest, + type: StrictStr, + name: StrictStr, + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None, + kwargs: Optional[List[KeyValuePair]] = None, _request_timeout: Union[ None, Annotated[StrictFloat, Field(gt=0)], - Tuple[ - Annotated[StrictFloat, Field(gt=0)], - Annotated[StrictFloat, Field(gt=0)] - ] + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], 
] = None, _request_auth: Optional[Dict[StrictStr, Any]] = None, _content_type: Optional[StrictStr] = None, _headers: Optional[Dict[StrictStr, Any]] = None, _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, ) -> RESTResponseType: - """extract_from_file_post + """extract - :param extraction_request: (required) - :type extraction_request: ExtractionRequest + :param type: (required) + :type type: str + :param name: (required) + :type name: str + :param file: + :type file: bytearray + :param kwargs: + :type kwargs: List[KeyValuePair] :param _request_timeout: timeout setting for this request. If one number provided, it will be total request timeout. It can also be a pair (tuple) of @@ -216,31 +232,33 @@ def extract_from_file_post_without_preload_content( in the spec for a single request. :type _host_index: int, optional :return: Returns the result object. - """ # noqa: E501 + """ # noqa: E501 - _param = self._extract_from_file_post_serialize( - extraction_request=extraction_request, + _param = self._extract_serialize( + type=type, + name=name, + file=file, + kwargs=kwargs, _request_auth=_request_auth, _content_type=_content_type, _headers=_headers, - _host_index=_host_index + _host_index=_host_index, ) _response_types_map: Dict[str, Optional[str]] = { - '200': "List[InformationPiece]", - '422': None, - '500': None, + "200": "List[InformationPiece]", + "422": None, + "500": None, } - response_data = self.api_client.call_api( - *_param, - _request_timeout=_request_timeout - ) + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) return response_data.response - - def _extract_from_file_post_serialize( + def _extract_serialize( self, - extraction_request, + type, + name, + file, + kwargs, _request_auth, _content_type, _headers, @@ -250,55 +268,48 @@ def _extract_from_file_post_serialize( _host = None _collection_formats: Dict[str, str] = { + "kwargs": "csv", } _path_params: Dict[str, str] = {} _query_params: List[Tuple[str, str]] = [] 
_header_params: Dict[str, Optional[str]] = _headers or {} _form_params: List[Tuple[str, str]] = [] - _files: Dict[ - str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]] - ] = {} + _files: Dict[str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]] = {} _body_params: Optional[bytes] = None # process the path parameters # process the query parameters # process the header parameters # process the form parameters + if file is not None: + _files["file"] = file + if type is not None: + _form_params.append(("type", type)) + if kwargs is not None: + _form_params.append(("kwargs", kwargs)) + if name is not None: + _form_params.append(("name", name)) # process the body parameter - if extraction_request is not None: - _body_params = extraction_request - # set the HTTP header `Accept` - if 'Accept' not in _header_params: - _header_params['Accept'] = self.api_client.select_header_accept( - [ - 'application/json' - ] - ) + if "Accept" not in _header_params: + _header_params["Accept"] = self.api_client.select_header_accept(["application/json"]) # set the HTTP header `Content-Type` if _content_type: - _header_params['Content-Type'] = _content_type + _header_params["Content-Type"] = _content_type else: - _default_content_type = ( - self.api_client.select_header_content_type( - [ - 'application/json' - ] - ) - ) + _default_content_type = self.api_client.select_header_content_type(["multipart/form-data"]) if _default_content_type is not None: - _header_params['Content-Type'] = _default_content_type + _header_params["Content-Type"] = _default_content_type # authentication setting - _auth_settings: List[str] = [ - ] + _auth_settings: List[str] = [] return self.api_client.param_serialize( - method='POST', - resource_path='/extract', + method="POST", + resource_path="/extract", path_params=_path_params, query_params=_query_params, header_params=_header_params, @@ -308,7 +319,5 @@ def _extract_from_file_post_serialize( auth_settings=_auth_settings, 
collection_formats=_collection_formats, _host=_host, - _request_auth=_request_auth + _request_auth=_request_auth, ) - - diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py index befdba6..ba8f5d2 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -37,11 +37,12 @@ UnauthorizedException, ForbiddenException, NotFoundException, - ServiceException + ServiceException, ) RequestSerialized = Tuple[str, str, Dict[str, str], Optional[str], List[str]] + class ApiClient: """Generic API client for OpenAPI client library builds. @@ -60,25 +61,19 @@ class ApiClient: PRIMITIVE_TYPES = (float, bool, bytes, str, int) NATIVE_TYPES_MAPPING = { - 'int': int, - 'long': int, # TODO remove as only py3 is supported? - 'float': float, - 'str': str, - 'bool': bool, - 'date': datetime.date, - 'datetime': datetime.datetime, - 'decimal': decimal.Decimal, - 'object': object, + "int": int, + "long": int, # TODO remove as only py3 is supported? 
+ "float": float, + "str": str, + "bool": bool, + "date": datetime.date, + "datetime": datetime.datetime, + "decimal": decimal.Decimal, + "object": object, } _pool = None - def __init__( - self, - configuration=None, - header_name=None, - header_value=None, - cookie=None - ) -> None: + def __init__(self, configuration=None, header_name=None, header_value=None, cookie=None) -> None: # use default configuration if none is provided if configuration is None: configuration = Configuration.get_default() @@ -90,7 +85,7 @@ def __init__( self.default_headers[header_name] = header_value self.cookie = cookie # Set default User-Agent. - self.user_agent = 'OpenAPI-Generator/1.0.0/python' + self.user_agent = "OpenAPI-Generator/1.0.0/python" self.client_side_validation = configuration.client_side_validation def __enter__(self): @@ -102,16 +97,15 @@ def __exit__(self, exc_type, exc_value, traceback): @property def user_agent(self): """User agent for this API client""" - return self.default_headers['User-Agent'] + return self.default_headers["User-Agent"] @user_agent.setter def user_agent(self, value): - self.default_headers['User-Agent'] = value + self.default_headers["User-Agent"] = value def set_default_header(self, header_name, header_value): self.default_headers[header_name] = header_value - _default = None @classmethod @@ -147,12 +141,12 @@ def param_serialize( header_params=None, body=None, post_params=None, - files=None, auth_settings=None, + files=None, + auth_settings=None, collection_formats=None, _host=None, - _request_auth=None + _request_auth=None, ) -> RequestSerialized: - """Builds the HTTP request params needed by the request. :param method: Method to call. :param resource_path: Path to method endpoint. 
@@ -181,47 +175,30 @@ def param_serialize( header_params = header_params or {} header_params.update(self.default_headers) if self.cookie: - header_params['Cookie'] = self.cookie + header_params["Cookie"] = self.cookie if header_params: header_params = self.sanitize_for_serialization(header_params) - header_params = dict( - self.parameters_to_tuples(header_params,collection_formats) - ) + header_params = dict(self.parameters_to_tuples(header_params, collection_formats)) # path parameters if path_params: path_params = self.sanitize_for_serialization(path_params) - path_params = self.parameters_to_tuples( - path_params, - collection_formats - ) + path_params = self.parameters_to_tuples(path_params, collection_formats) for k, v in path_params: # specified safe chars, encode everything - resource_path = resource_path.replace( - '{%s}' % k, - quote(str(v), safe=config.safe_chars_for_path_param) - ) + resource_path = resource_path.replace("{%s}" % k, quote(str(v), safe=config.safe_chars_for_path_param)) # post parameters if post_params or files: post_params = post_params if post_params else [] post_params = self.sanitize_for_serialization(post_params) - post_params = self.parameters_to_tuples( - post_params, - collection_formats - ) + post_params = self.parameters_to_tuples(post_params, collection_formats) if files: post_params.extend(self.files_parameters(files)) # auth setting self.update_params_for_auth( - header_params, - query_params, - auth_settings, - resource_path, - method, - body, - request_auth=_request_auth + header_params, query_params, auth_settings, resource_path, method, body, request_auth=_request_auth ) # body @@ -238,23 +215,13 @@ def param_serialize( # query parameters if query_params: query_params = self.sanitize_for_serialization(query_params) - url_query = self.parameters_to_url_query( - query_params, - collection_formats - ) + url_query = self.parameters_to_url_query(query_params, collection_formats) url += "?" 
+ url_query return method, url, header_params, body, post_params - def call_api( - self, - method, - url, - header_params=None, - body=None, - post_params=None, - _request_timeout=None + self, method, url, header_params=None, body=None, post_params=None, _request_timeout=None ) -> rest.RESTResponse: """Makes the HTTP request (synchronous) :param method: Method to call. @@ -271,10 +238,12 @@ def call_api( try: # perform request and return response response_data = self.rest_client.request( - method, url, + method, + url, headers=header_params, - body=body, post_params=post_params, - _request_timeout=_request_timeout + body=body, + post_params=post_params, + _request_timeout=_request_timeout, ) except ApiException as e: @@ -283,9 +252,7 @@ def call_api( return response_data def response_deserialize( - self, - response_data: rest.RESTResponse, - response_types_map: Optional[Dict[str, ApiResponseT]]=None + self, response_data: rest.RESTResponse, response_types_map: Optional[Dict[str, ApiResponseT]] = None ) -> ApiResponse[ApiResponseT]: """Deserializes response into an object. :param response_data: RESTResponse object to be deserialized. 
@@ -311,7 +278,7 @@ def response_deserialize( return_data = self.__deserialize_file(response_data) elif response_type is not None: match = None - content_type = response_data.getheader('content-type') + content_type = response_data.getheader("content-type") if content_type is not None: match = re.search(r"charset=([a-zA-Z\-\d]+)[\s;]?", content_type) encoding = match.group(1) if match else "utf-8" @@ -326,10 +293,10 @@ def response_deserialize( ) return ApiResponse( - status_code = response_data.status, - data = return_data, - headers = response_data.getheaders(), - raw_data = response_data.data + status_code=response_data.status, + data=return_data, + headers=response_data.getheaders(), + raw_data=response_data.data, ) def sanitize_for_serialization(self, obj): @@ -357,13 +324,9 @@ def sanitize_for_serialization(self, obj): elif isinstance(obj, self.PRIMITIVE_TYPES): return obj elif isinstance(obj, list): - return [ - self.sanitize_for_serialization(sub_obj) for sub_obj in obj - ] + return [self.sanitize_for_serialization(sub_obj) for sub_obj in obj] elif isinstance(obj, tuple): - return tuple( - self.sanitize_for_serialization(sub_obj) for sub_obj in obj - ) + return tuple(self.sanitize_for_serialization(sub_obj) for sub_obj in obj) elif isinstance(obj, (datetime.datetime, datetime.date)): return obj.isoformat() elif isinstance(obj, decimal.Decimal): @@ -377,15 +340,12 @@ def sanitize_for_serialization(self, obj): # and attributes which value is not None. # Convert attribute name to json key in # model definition for request. 
- if hasattr(obj, 'to_dict') and callable(getattr(obj, 'to_dict')): + if hasattr(obj, "to_dict") and callable(getattr(obj, "to_dict")): obj_dict = obj.to_dict() else: obj_dict = obj.__dict__ - return { - key: self.sanitize_for_serialization(val) - for key, val in obj_dict.items() - } + return {key: self.sanitize_for_serialization(val) for key, val in obj_dict.items()} def deserialize(self, response_text: str, response_type: str, content_type: Optional[str]): """Deserializes response into an object. @@ -404,18 +364,15 @@ def deserialize(self, response_text: str, response_type: str, content_type: Opti data = json.loads(response_text) except ValueError: data = response_text - elif re.match(r'^application/(json|[\w!#$&.+-^_]+\+json)\s*(;|$)', content_type, re.IGNORECASE): + elif re.match(r"^application/(json|[\w!#$&.+-^_]+\+json)\s*(;|$)", content_type, re.IGNORECASE): if response_text == "": data = "" else: data = json.loads(response_text) - elif re.match(r'^text\/[a-z.+-]+\s*(;|$)', content_type, re.IGNORECASE): + elif re.match(r"^text\/[a-z.+-]+\s*(;|$)", content_type, re.IGNORECASE): data = response_text else: - raise ApiException( - status=0, - reason="Unsupported content type: {0}".format(content_type) - ) + raise ApiException(status=0, reason="Unsupported content type: {0}".format(content_type)) return self.__deserialize(data, response_type) @@ -431,19 +388,17 @@ def __deserialize(self, data, klass): return None if isinstance(klass, str): - if klass.startswith('List['): - m = re.match(r'List\[(.*)]', klass) + if klass.startswith("List["): + m = re.match(r"List\[(.*)]", klass) assert m is not None, "Malformed List type definition" sub_kls = m.group(1) - return [self.__deserialize(sub_data, sub_kls) - for sub_data in data] + return [self.__deserialize(sub_data, sub_kls) for sub_data in data] - if klass.startswith('Dict['): - m = re.match(r'Dict\[([^,]*), (.*)]', klass) + if klass.startswith("Dict["): + m = re.match(r"Dict\[([^,]*), (.*)]", klass) assert m is not 
None, "Malformed Dict type definition" sub_kls = m.group(2) - return {k: self.__deserialize(v, sub_kls) - for k, v in data.items()} + return {k: self.__deserialize(v, sub_kls) for k, v in data.items()} # convert str to class if klass in self.NATIVE_TYPES_MAPPING: @@ -479,19 +434,18 @@ def parameters_to_tuples(self, params, collection_formats): for k, v in params.items() if isinstance(params, dict) else params: if k in collection_formats: collection_format = collection_formats[k] - if collection_format == 'multi': + if collection_format == "multi": new_params.extend((k, value) for value in v) else: - if collection_format == 'ssv': - delimiter = ' ' - elif collection_format == 'tsv': - delimiter = '\t' - elif collection_format == 'pipes': - delimiter = '|' + if collection_format == "ssv": + delimiter = " " + elif collection_format == "tsv": + delimiter = "\t" + elif collection_format == "pipes": + delimiter = "|" else: # csv is the default - delimiter = ',' - new_params.append( - (k, delimiter.join(str(value) for value in v))) + delimiter = "," + new_params.append((k, delimiter.join(str(value) for value in v))) else: new_params.append((k, v)) return new_params @@ -516,20 +470,18 @@ def parameters_to_url_query(self, params, collection_formats): if k in collection_formats: collection_format = collection_formats[k] - if collection_format == 'multi': + if collection_format == "multi": new_params.extend((k, str(value)) for value in v) else: - if collection_format == 'ssv': - delimiter = ' ' - elif collection_format == 'tsv': - delimiter = '\t' - elif collection_format == 'pipes': - delimiter = '|' + if collection_format == "ssv": + delimiter = " " + elif collection_format == "tsv": + delimiter = "\t" + elif collection_format == "pipes": + delimiter = "|" else: # csv is the default - delimiter = ',' - new_params.append( - (k, delimiter.join(quote(str(value)) for value in v)) - ) + delimiter = "," + new_params.append((k, delimiter.join(quote(str(value)) for value in v))) 
else: new_params.append((k, quote(str(v)))) @@ -547,7 +499,7 @@ def files_parameters( params = [] for k, v in files.items(): if isinstance(v, str): - with open(v, 'rb') as f: + with open(v, "rb") as f: filename = os.path.basename(f.name) filedata = f.read() elif isinstance(v, bytes): @@ -561,13 +513,8 @@ def files_parameters( continue else: raise ValueError("Unsupported file value") - mimetype = ( - mimetypes.guess_type(filename)[0] - or 'application/octet-stream' - ) - params.append( - tuple([k, tuple([filename, filedata, mimetype])]) - ) + mimetype = mimetypes.guess_type(filename)[0] or "application/octet-stream" + params.append(tuple([k, tuple([filename, filedata, mimetype])])) return params def select_header_accept(self, accepts: List[str]) -> Optional[str]: @@ -580,7 +527,7 @@ def select_header_accept(self, accepts: List[str]) -> Optional[str]: return None for accept in accepts: - if re.search('json', accept, re.IGNORECASE): + if re.search("json", accept, re.IGNORECASE): return accept return accepts[0] @@ -595,20 +542,13 @@ def select_header_content_type(self, content_types): return None for content_type in content_types: - if re.search('json', content_type, re.IGNORECASE): + if re.search("json", content_type, re.IGNORECASE): return content_type return content_types[0] def update_params_for_auth( - self, - headers, - queries, - auth_settings, - resource_path, - method, - body, - request_auth=None + self, headers, queries, auth_settings, resource_path, method, body, request_auth=None ) -> None: """Updates header and query params based on authentication setting. 
@@ -626,36 +566,14 @@ def update_params_for_auth( return if request_auth: - self._apply_auth_params( - headers, - queries, - resource_path, - method, - body, - request_auth - ) + self._apply_auth_params(headers, queries, resource_path, method, body, request_auth) else: for auth in auth_settings: auth_setting = self.configuration.auth_settings().get(auth) if auth_setting: - self._apply_auth_params( - headers, - queries, - resource_path, - method, - body, - auth_setting - ) - - def _apply_auth_params( - self, - headers, - queries, - resource_path, - method, - body, - auth_setting - ) -> None: + self._apply_auth_params(headers, queries, resource_path, method, body, auth_setting) + + def _apply_auth_params(self, headers, queries, resource_path, method, body, auth_setting) -> None: """Updates the request parameters based on a single auth_setting :param headers: Header parameters dict to be updated. @@ -666,17 +584,15 @@ def _apply_auth_params( The object type is the return value of sanitize_for_serialization(). 
:param auth_setting: auth settings for the endpoint """ - if auth_setting['in'] == 'cookie': - headers['Cookie'] = auth_setting['value'] - elif auth_setting['in'] == 'header': - if auth_setting['type'] != 'http-signature': - headers[auth_setting['key']] = auth_setting['value'] - elif auth_setting['in'] == 'query': - queries.append((auth_setting['key'], auth_setting['value'])) + if auth_setting["in"] == "cookie": + headers["Cookie"] = auth_setting["value"] + elif auth_setting["in"] == "header": + if auth_setting["type"] != "http-signature": + headers[auth_setting["key"]] = auth_setting["value"] + elif auth_setting["in"] == "query": + queries.append((auth_setting["key"], auth_setting["value"])) else: - raise ApiValueError( - 'Authentication token must be in `query` or `header`' - ) + raise ApiValueError("Authentication token must be in `query` or `header`") def __deserialize_file(self, response): """Deserializes body to file @@ -696,10 +612,7 @@ def __deserialize_file(self, response): content_disposition = response.getheader("Content-Disposition") if content_disposition: - m = re.search( - r'filename=[\'"]?([^\'"\s]+)[\'"]?', - content_disposition - ) + m = re.search(r'filename=[\'"]?([^\'"\s]+)[\'"]?', content_disposition) assert m is not None, "Unexpected 'content-disposition' header value" filename = m.group(1) path = os.path.join(os.path.dirname(path), filename) @@ -742,10 +655,7 @@ def __deserialize_date(self, string): except ImportError: return string except ValueError: - raise rest.ApiException( - status=0, - reason="Failed to parse `{0}` as date object".format(string) - ) + raise rest.ApiException(status=0, reason="Failed to parse `{0}` as date object".format(string)) def __deserialize_datetime(self, string): """Deserializes string to datetime. 
@@ -760,13 +670,7 @@ def __deserialize_datetime(self, string): except ImportError: return string except ValueError: - raise rest.ApiException( - status=0, - reason=( - "Failed to parse `{0}` as datetime object" - .format(string) - ) - ) + raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as datetime object".format(string))) def __deserialize_enum(self, data, klass): """Deserializes primitive type to enum. @@ -778,13 +682,7 @@ def __deserialize_enum(self, data, klass): try: return klass(data) except ValueError: - raise rest.ApiException( - status=0, - reason=( - "Failed to parse `{0}` as `{1}`" - .format(data, klass) - ) - ) + raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as `{1}`".format(data, klass))) def __deserialize_model(self, data, klass): """Deserializes list or dict to model. diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py index 9bc7c11..1ce1372 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py @@ -6,6 +6,7 @@ T = TypeVar("T") + class ApiResponse(BaseModel, Generic[T]): """ API response object @@ -16,6 +17,4 @@ class ApiResponse(BaseModel, Generic[T]): data: T = Field(description="Deserialized data given the data type") raw_data: StrictBytes = Field(description="Raw data (HTTP response body)") - model_config = { - "arbitrary_types_allowed": True - } + model_config = {"arbitrary_types_allowed": True} diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py index 0b76ea2..2e80369 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py +++ 
b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -23,11 +23,19 @@ import http.client as httplib JSON_SCHEMA_VALIDATION_KEYWORDS = { - 'multipleOf', 'maximum', 'exclusiveMaximum', - 'minimum', 'exclusiveMinimum', 'maxLength', - 'minLength', 'pattern', 'maxItems', 'minItems' + "multipleOf", + "maximum", + "exclusiveMaximum", + "minimum", + "exclusiveMinimum", + "maxLength", + "minLength", + "pattern", + "maxItems", + "minItems", } + class Configuration: """This class contains various settings of the API client. 
@@ -63,20 +71,25 @@ class Configuration: _default = None - def __init__(self, host=None, - api_key=None, api_key_prefix=None, - username=None, password=None, - access_token=None, - server_index=None, server_variables=None, - server_operation_index=None, server_operation_variables=None, - ignore_operation_servers=False, - ssl_ca_cert=None, - retries=None, - *, - debug: Optional[bool] = None - ) -> None: - """Constructor - """ + def __init__( + self, + host=None, + api_key=None, + api_key_prefix=None, + username=None, + password=None, + access_token=None, + server_index=None, + server_variables=None, + server_operation_index=None, + server_operation_variables=None, + ignore_operation_servers=False, + ssl_ca_cert=None, + retries=None, + *, + debug: Optional[bool] = None + ) -> None: + """Constructor""" self._base_path = "http://localhost" if host is None else host """Default Base url """ @@ -122,7 +135,7 @@ def __init__(self, host=None, """ self.logger["package_logger"] = logging.getLogger("admin_api_lib.extractor_api_client.openapi_client") self.logger["urllib3_logger"] = logging.getLogger("urllib3") - self.logger_format = '%(asctime)s %(levelname)s %(message)s' + self.logger_format = "%(asctime)s %(levelname)s %(message)s" """Log format """ self.logger_stream_handler = None @@ -177,7 +190,7 @@ def __init__(self, host=None, self.proxy_headers = None """Proxy headers """ - self.safe_chars_for_path_param = '' + self.safe_chars_for_path_param = "" """Safe chars for path_param """ self.retries = retries @@ -203,7 +216,7 @@ def __deepcopy__(self, memo): result = cls.__new__(cls) memo[id(self)] = result for k, v in self.__dict__.items(): - if k not in ('logger', 'logger_file_handler'): + if k not in ("logger", "logger_file_handler"): setattr(result, k, copy.deepcopy(v, memo)) # shallow copy of loggers result.logger = copy.copy(self.logger) @@ -363,9 +376,7 @@ def get_basic_auth_token(self): password = "" if self.password is not None: password = self.password - return 
urllib3.util.make_headers( - basic_auth=username + ':' + password - ).get('authorization') + return urllib3.util.make_headers(basic_auth=username + ":" + password).get("authorization") def auth_settings(self): """Gets Auth Settings dict for api client. @@ -380,12 +391,13 @@ def to_debug_report(self): :return: The report for debugging. """ - return "Python SDK Debug Report:\n"\ - "OS: {env}\n"\ - "Python Version: {pyversion}\n"\ - "Version of the API: 1.0.0\n"\ - "SDK Package Version: 1.0.0".\ - format(env=sys.platform, pyversion=sys.version) + return ( + "Python SDK Debug Report:\n" + "OS: {env}\n" + "Python Version: {pyversion}\n" + "Version of the API: 1.0.0\n" + "SDK Package Version: 1.0.0".format(env=sys.platform, pyversion=sys.version) + ) def get_host_settings(self): """Gets an array of host settings @@ -394,8 +406,8 @@ def get_host_settings(self): """ return [ { - 'url': "", - 'description': "No description provided", + "url": "", + "description": "No description provided", } ] @@ -417,22 +429,20 @@ def get_host_from_settings(self, index, variables=None, servers=None): except IndexError: raise ValueError( "Invalid index {0} when selecting the host settings. " - "Must be less than {1}".format(index, len(servers))) + "Must be less than {1}".format(index, len(servers)) + ) - url = server['url'] + url = server["url"] # go through variables and replace placeholders - for variable_name, variable in server.get('variables', {}).items(): - used_value = variables.get( - variable_name, variable['default_value']) + for variable_name, variable in server.get("variables", {}).items(): + used_value = variables.get(variable_name, variable["default_value"]) - if 'enum_values' in variable \ - and used_value not in variable['enum_values']: + if "enum_values" in variable and used_value not in variable["enum_values"]: raise ValueError( "The variable `{0}` in the host URL has invalid value " - "{1}. 
Must be {2}.".format( - variable_name, variables[variable_name], - variable['enum_values'])) + "{1}. Must be {2}.".format(variable_name, variables[variable_name], variable["enum_values"]) + ) url = url.replace("{" + variable_name + "}", used_value) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py index a5adf00..5dbd4b0 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py @@ -1,27 +1,27 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 from typing import Any, Optional from typing_extensions import Self + class OpenApiException(Exception): """The base exception class for all OpenAPIExceptions""" class ApiTypeError(OpenApiException, TypeError): - def __init__(self, msg, path_to_item=None, valid_classes=None, - key_type=None) -> None: - """ Raises an exception for TypeErrors + def __init__(self, msg, path_to_item=None, valid_classes=None, key_type=None) -> None: + """Raises an exception for TypeErrors Args: msg (str): the exception message @@ -104,9 +104,9 @@ def __init__(self, msg, path_to_item=None) -> None: class ApiException(OpenApiException): def __init__( - self, - status=None, - reason=None, + self, + status=None, + reason=None, http_resp=None, *, body: Optional[str] = None, @@ -125,17 +125,17 @@ def __init__( self.reason = http_resp.reason if self.body is None: try: - self.body = http_resp.data.decode('utf-8') + self.body = http_resp.data.decode("utf-8") except Exception: pass self.headers = http_resp.getheaders() @classmethod def from_response( - cls, - *, - http_resp, - body: Optional[str], + cls, + *, + http_resp, + body: Optional[str], data: Optional[Any], ) -> Self: if http_resp.status == 400: @@ -156,11 +156,9 @@ def from_response( def __str__(self): """Custom error messages for exception""" - error_message = "({0})\n"\ - "Reason: {1}\n".format(self.status, self.reason) + error_message = "({0})\n" "Reason: {1}\n".format(self.status, self.reason) if self.headers: - error_message += "HTTP response headers: {0}\n".format( - self.headers) + error_message += "HTTP response headers: {0}\n".format(self.headers) if self.data or self.body: error_message += "HTTP response body: {0}\n".format(self.data or self.body) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py index 022896f..e0ef19f 100644 --- 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py @@ -2,19 +2,18 @@ # flake8: noqa """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 # import models into model package from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py index b797b12..cd0f9c7 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI 
Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -19,20 +19,16 @@ class ContentType(str, Enum): - """ - - """ + """ """ """ allowed enum values """ - IMAGE = 'IMAGE' - TABLE = 'TABLE' - TEXT = 'TEXT' + IMAGE = "IMAGE" + TABLE = "TABLE" + TEXT = "TEXT" @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ContentType from a JSON string""" return cls(json.loads(json_str)) - - diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py index db65003..4f9f9af 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -23,10 +23,10 @@ from typing import Optional, Set from typing_extensions import Self + class ExtractionRequest(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None type: StrictStr kwargs: Optional[List[KeyValuePair]] = None @@ -38,7 +38,6 @@ class ExtractionRequest(BaseModel): protected_namespaces=(), ) - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -63,8 +62,7 @@ def to_dict(self) -> Dict[str, Any]: were set at model initialization. Other fields with value `None` are ignored. """ - excluded_fields: Set[str] = set([ - ]) + excluded_fields: Set[str] = set([]) _dict = self.model_dump( by_alias=True, @@ -77,7 +75,7 @@ def to_dict(self) -> Dict[str, Any]: for _item_kwargs in self.kwargs: if _item_kwargs: _items.append(_item_kwargs.to_dict()) - _dict['kwargs'] = _items + _dict["kwargs"] = _items return _dict @classmethod @@ -89,11 +87,15 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "file": obj.get("file"), - "type": obj.get("type"), - "kwargs": [KeyValuePair.from_dict(_item) for _item in obj["kwargs"]] if obj.get("kwargs") is not None else None - }) + _obj = cls.model_validate( + { + "file": obj.get("file"), + "type": obj.get("type"), + "kwargs": ( + [KeyValuePair.from_dict(_item) for _item in obj["kwargs"]] + if obj.get("kwargs") is not None + else None + ), + } + ) return _obj - - diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py index 95a0fdb..a428183 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py +++ 
b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -24,10 +24,12 @@ from typing import Optional, Set from typing_extensions import Self + class InformationPiece(BaseModel): """ A piece of information that has been extracted. - """ # noqa: E501 + """ # noqa: E501 + metadata: List[KeyValuePair] page_content: StrictStr type: ContentType @@ -39,7 +41,6 @@ class InformationPiece(BaseModel): protected_namespaces=(), ) - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -64,8 +65,7 @@ def to_dict(self) -> Dict[str, Any]: were set at model initialization. Other fields with value `None` are ignored. 
""" - excluded_fields: Set[str] = set([ - ]) + excluded_fields: Set[str] = set([]) _dict = self.model_dump( by_alias=True, @@ -78,7 +78,7 @@ def to_dict(self) -> Dict[str, Any]: for _item_metadata in self.metadata: if _item_metadata: _items.append(_item_metadata.to_dict()) - _dict['metadata'] = _items + _dict["metadata"] = _items return _dict @classmethod @@ -90,11 +90,15 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "metadata": [KeyValuePair.from_dict(_item) for _item in obj["metadata"]] if obj.get("metadata") is not None else None, - "page_content": obj.get("page_content"), - "type": obj.get("type") - }) + _obj = cls.model_validate( + { + "metadata": ( + [KeyValuePair.from_dict(_item) for _item in obj["metadata"]] + if obj.get("metadata") is not None + else None + ), + "page_content": obj.get("page_content"), + "type": obj.get("type"), + } + ) return _obj - - diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py index 553288b..2a77b65 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. 
+Do not edit the class manually. """ # noqa: E501 @@ -22,10 +22,10 @@ from typing import Optional, Set from typing_extensions import Self + class KeyValuePair(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + key: Optional[Any] = None value: Optional[Any] = None __properties: ClassVar[List[str]] = ["key", "value"] @@ -36,7 +36,6 @@ class KeyValuePair(BaseModel): protected_namespaces=(), ) - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -61,8 +60,7 @@ def to_dict(self) -> Dict[str, Any]: were set at model initialization. Other fields with value `None` are ignored. """ - excluded_fields: Set[str] = set([ - ]) + excluded_fields: Set[str] = set([]) _dict = self.model_dump( by_alias=True, @@ -72,12 +70,12 @@ def to_dict(self) -> Dict[str, Any]: # set to None if key (nullable) is None # and model_fields_set contains the field if self.key is None and "key" in self.model_fields_set: - _dict['key'] = None + _dict["key"] = None # set to None if value (nullable) is None # and model_fields_set contains the field if self.value is None and "value" in self.model_fields_set: - _dict['value'] = None + _dict["value"] = None return _dict @@ -90,10 +88,5 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "key": obj.get("key"), - "value": obj.get("value") - }) + _obj = cls.model_validate({"key": obj.get("key"), "value": obj.get("value")}) return _obj - - diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py index 32b1c3a..60fc660 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - 
extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -78,22 +78,19 @@ def __init__(self, configuration) -> None: "key_file": configuration.key_file, } if configuration.assert_hostname is not None: - pool_args['assert_hostname'] = ( - configuration.assert_hostname - ) + pool_args["assert_hostname"] = configuration.assert_hostname if configuration.retries is not None: - pool_args['retries'] = configuration.retries + pool_args["retries"] = configuration.retries if configuration.tls_server_name: - pool_args['server_hostname'] = configuration.tls_server_name - + pool_args["server_hostname"] = configuration.tls_server_name if configuration.socket_options is not None: - pool_args['socket_options'] = configuration.socket_options + pool_args["socket_options"] = configuration.socket_options if configuration.connection_pool_maxsize is not None: - pool_args['maxsize'] = configuration.connection_pool_maxsize + pool_args["maxsize"] = configuration.connection_pool_maxsize # https pool manager self.pool_manager: urllib3.PoolManager @@ -101,6 +98,7 @@ def __init__(self, configuration) -> None: if configuration.proxy: if is_socks_proxy_url(configuration.proxy): from urllib3.contrib.socks import SOCKSProxyManager + pool_args["proxy_url"] = configuration.proxy pool_args["headers"] = configuration.proxy_headers self.pool_manager = SOCKSProxyManager(**pool_args) @@ -111,15 +109,7 @@ def __init__(self, configuration) -> None: else: self.pool_manager = urllib3.PoolManager(**pool_args) - def 
request( - self, - method, - url, - headers=None, - body=None, - post_params=None, - _request_timeout=None - ): + def request(self, method, url, headers=None, body=None, post_params=None, _request_timeout=None): """Perform requests. :param method: http request method @@ -135,20 +125,10 @@ def request( (connection, read) timeouts. """ method = method.upper() - assert method in [ - 'GET', - 'HEAD', - 'DELETE', - 'POST', - 'PUT', - 'PATCH', - 'OPTIONS' - ] + assert method in ["GET", "HEAD", "DELETE", "POST", "PUT", "PATCH", "OPTIONS"] if post_params and body: - raise ApiValueError( - "body parameter cannot be used with post_params parameter." - ) + raise ApiValueError("body parameter cannot be used with post_params parameter.") post_params = post_params or {} headers = headers or {} @@ -157,37 +137,23 @@ def request( if _request_timeout: if isinstance(_request_timeout, (int, float)): timeout = urllib3.Timeout(total=_request_timeout) - elif ( - isinstance(_request_timeout, tuple) - and len(_request_timeout) == 2 - ): - timeout = urllib3.Timeout( - connect=_request_timeout[0], - read=_request_timeout[1] - ) + elif isinstance(_request_timeout, tuple) and len(_request_timeout) == 2: + timeout = urllib3.Timeout(connect=_request_timeout[0], read=_request_timeout[1]) try: # For `POST`, `PUT`, `PATCH`, `OPTIONS`, `DELETE` - if method in ['POST', 'PUT', 'PATCH', 'OPTIONS', 'DELETE']: + if method in ["POST", "PUT", "PATCH", "OPTIONS", "DELETE"]: # no content type provided or payload is json - content_type = headers.get('Content-Type') - if ( - not content_type - or re.search('json', content_type, re.IGNORECASE) - ): + content_type = headers.get("Content-Type") + if not content_type or re.search("json", content_type, re.IGNORECASE): request_body = None if body is not None: request_body = json.dumps(body) r = self.pool_manager.request( - method, - url, - body=request_body, - timeout=timeout, - headers=headers, - preload_content=False + method, url, body=request_body, 
timeout=timeout, headers=headers, preload_content=False ) - elif content_type == 'application/x-www-form-urlencoded': + elif content_type == "application/x-www-form-urlencoded": r = self.pool_manager.request( method, url, @@ -195,15 +161,15 @@ def request( encode_multipart=False, timeout=timeout, headers=headers, - preload_content=False + preload_content=False, ) - elif content_type == 'multipart/form-data': + elif content_type == "multipart/form-data": # must del headers['Content-Type'], or the correct # Content-Type which generated by urllib3 will be # overwritten. - del headers['Content-Type'] + del headers["Content-Type"] # Ensures that dict objects are serialized - post_params = [(a, json.dumps(b)) if isinstance(b, dict) else (a,b) for a, b in post_params] + post_params = [(a, json.dumps(b)) if isinstance(b, dict) else (a, b) for a, b in post_params] r = self.pool_manager.request( method, url, @@ -211,29 +177,20 @@ def request( encode_multipart=True, timeout=timeout, headers=headers, - preload_content=False + preload_content=False, ) # Pass a `string` parameter directly in the body to support # other content types than JSON when `body` argument is # provided in serialized form. 
elif isinstance(body, str) or isinstance(body, bytes): r = self.pool_manager.request( - method, - url, - body=body, - timeout=timeout, - headers=headers, - preload_content=False + method, url, body=body, timeout=timeout, headers=headers, preload_content=False ) - elif headers['Content-Type'].startswith('text/') and isinstance(body, bool): + elif headers["Content-Type"].startswith("text/") and isinstance(body, bool): request_body = "true" if body else "false" r = self.pool_manager.request( - method, - url, - body=request_body, - preload_content=False, - timeout=timeout, - headers=headers) + method, url, body=request_body, preload_content=False, timeout=timeout, headers=headers + ) else: # Cannot generate the request from given parameters msg = """Cannot prepare a request message for provided @@ -243,12 +200,7 @@ def request( # For `GET`, `HEAD` else: r = self.pool_manager.request( - method, - url, - fields={}, - timeout=timeout, - headers=headers, - preload_content=False + method, url, fields={}, timeout=timeout, headers=headers, preload_content=False ) except urllib3.exceptions.SSLError as e: msg = "\n".join([type(e).__name__, str(e)]) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py index 9704fc8..5a78d9b 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The 
version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -16,6 +16,7 @@ from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType + class TestContentType(unittest.TestCase): """ContentType unit test stubs""" @@ -29,5 +30,6 @@ def testContentType(self): """Test ContentType""" # inst = ContentType() -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py index fd48e16..2f8f1bd 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -16,6 +16,7 @@ from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest + class TestExtractionRequest(unittest.TestCase): """ExtractionRequest unit test stubs""" @@ -27,9 +28,9 @@ def tearDown(self): def make_instance(self, include_optional) -> ExtractionRequest: """Test ExtractionRequest - include_optional is a boolean, when False only required - params are included, when True both required and - optional params are included """ + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" # uncomment below to create an instance of `ExtractionRequest` """ model = ExtractionRequest() @@ -52,5 +53,6 @@ def testExtractionRequest(self): # inst_req_only = self.make_instance(include_optional=False) # inst_req_and_optional = self.make_instance(include_optional=True) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py index e76b68d..f39a507 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. 
+Do not edit the class manually. """ # noqa: E501 @@ -27,11 +27,9 @@ def tearDown(self) -> None: pass def test_extract_from_file_post(self) -> None: - """Test case for extract_from_file_post - - """ + """Test case for extract_from_file_post""" pass -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py index 0661af0..479c858 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -16,6 +16,7 @@ from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece + class TestInformationPiece(unittest.TestCase): """InformationPiece unit test stubs""" @@ -27,9 +28,9 @@ def tearDown(self): def make_instance(self, include_optional) -> InformationPiece: """Test InformationPiece - include_optional is a boolean, when False only required - params are included, when True both required and - optional params are included """ + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" # uncomment below to create an instance of `InformationPiece` """ model = InformationPiece() @@ -56,5 +57,6 @@ def testInformationPiece(self): # inst_req_only = self.make_instance(include_optional=False) # inst_req_and_optional = self.make_instance(include_optional=True) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py index 695ebb9..0ddc864 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. 
+Do not edit the class manually. """ # noqa: E501 @@ -16,6 +16,7 @@ from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair + class TestKeyValuePair(unittest.TestCase): """KeyValuePair unit test stubs""" @@ -27,9 +28,9 @@ def tearDown(self): def make_instance(self, include_optional) -> KeyValuePair: """Test KeyValuePair - include_optional is a boolean, when False only required - params are included, when True both required and - optional params are included """ + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" # uncomment below to create an instance of `KeyValuePair` """ model = KeyValuePair() @@ -48,5 +49,6 @@ def testKeyValuePair(self): # inst_req_only = self.make_instance(include_optional=False) # inst_req_and_optional = self.make_instance(include_optional=True) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/models/document_status.py b/admin-api-lib/src/admin_api_lib/models/document_status.py index d00dfce..fedce07 100644 --- a/admin-api-lib/src/admin_api_lib/models/document_status.py +++ b/admin-api-lib/src/admin_api_lib/models/document_status.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - admin-api-lib +admin-api-lib - The API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict, StrictStr from typing import Any, ClassVar, Dict, List from admin_api_lib.models.status import Status + try: from typing import Self except ImportError: from typing_extensions import Self + class DocumentStatus(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + name: StrictStr status: Status __properties: ClassVar[List[str]] = ["name", "status"] @@ -42,7 +41,6 @@ class DocumentStatus(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -69,8 +67,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) return _dict @@ -84,10 +81,5 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "name": obj.get("name"), - "status": obj.get("status") - }) + _obj = cls.model_validate({"name": obj.get("name"), "status": obj.get("status")}) return _obj - - diff --git a/admin-api-lib/src/admin_api_lib/models/extra_models.py b/admin-api-lib/src/admin_api_lib/models/extra_models.py index a3a283f..f0588d2 100644 --- a/admin-api-lib/src/admin_api_lib/models/extra_models.py +++ b/admin-api-lib/src/admin_api_lib/models/extra_models.py @@ -2,6 +2,7 @@ from pydantic import BaseModel + class TokenModel(BaseModel): """Defines a token model.""" diff --git a/admin-api-lib/src/admin_api_lib/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py index 8419cfa..2d2fe5e 100644 --- a/admin-api-lib/src/admin_api_lib/models/key_value_pair.py +++ b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - admin-api-lib +admin-api-lib - The API is used for the communication between the admin frontend and the admin backend in the rag 
project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,19 +18,18 @@ import json - - from pydantic import BaseModel, ConfigDict from typing import Any, ClassVar, Dict, List, Optional + try: from typing import Self except ImportError: from typing_extensions import Self + class KeyValuePair(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + key: Optional[Any] = None value: Optional[Any] = None __properties: ClassVar[List[str]] = ["key", "value"] @@ -41,7 +40,6 @@ class KeyValuePair(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -68,19 +66,18 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # set to None if key (nullable) is None # and model_fields_set contains the field if self.key is None and "key" in self.model_fields_set: - _dict['key'] = None + _dict["key"] = None # set to None if value (nullable) is None # and model_fields_set contains the field if self.value is None and "value" in self.model_fields_set: - _dict['value'] = None + _dict["value"] = None return _dict @@ -93,10 +90,5 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "key": obj.get("key"), - "value": obj.get("value") - }) + _obj = cls.model_validate({"key": obj.get("key"), "value": obj.get("value")}) return _obj - - diff --git a/admin-api-lib/src/admin_api_lib/models/status.py 
b/admin-api-lib/src/admin_api_lib/models/status.py index 2e0de2c..5c7836f 100644 --- a/admin-api-lib/src/admin_api_lib/models/status.py +++ b/admin-api-lib/src/admin_api_lib/models/status.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - admin-api-lib +admin-api-lib - The API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -19,7 +19,6 @@ from enum import Enum - try: from typing import Self except ImportError: @@ -27,21 +26,17 @@ class Status(str, Enum): - """ - - """ + """ """ """ allowed enum values """ - UPLOADING = 'UPLOADING' - PROCESSING = 'PROCESSING' - READY = 'READY' - ERROR = 'ERROR' + UPLOADING = "UPLOADING" + PROCESSING = "PROCESSING" + READY = "READY" + ERROR = "ERROR" @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of Status from a JSON string""" return cls(json.loads(json_str)) - - diff --git a/admin-api-lib/src/admin_api_lib/models/upload_source.py b/admin-api-lib/src/admin_api_lib/models/upload_source.py index f76b987..e90690f 100644 --- a/admin-api-lib/src/admin_api_lib/models/upload_source.py +++ b/admin-api-lib/src/admin_api_lib/models/upload_source.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - admin-api-lib +admin-api-lib - The API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. 
- The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union from admin_api_lib.models.key_value_pair import KeyValuePair + try: from typing import Self except ImportError: from typing_extensions import Self + class UploadSource(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None type: StrictStr kwargs: Optional[List[KeyValuePair]] = None @@ -43,7 +42,6 @@ class UploadSource(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -70,8 +68,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) @@ -80,7 +77,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.kwargs: if _item: _items.append(_item.to_dict()) - _dict['kwargs'] = _items + _dict["kwargs"] = _items return _dict @classmethod @@ -92,11 +89,15 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "file": obj.get("file"), - "type": obj.get("type"), - "kwargs": [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] if obj.get("kwargs") is not None else None - }) + _obj = cls.model_validate( + { + "file": obj.get("file"), + "type": obj.get("type"), + "kwargs": ( + [KeyValuePair.from_dict(_item) for _item 
in obj.get("kwargs")] + if obj.get("kwargs") is not None + else None + ), + } + ) return _obj - - diff --git a/extractor-api-lib/openapi.yaml b/extractor-api-lib/openapi.yaml index d949eb7..262f11b 100644 --- a/extractor-api-lib/openapi.yaml +++ b/extractor-api-lib/openapi.yaml @@ -7,10 +7,10 @@ servers: paths: /extract: post: - operationId: extract_from_file_post + operationId: extract requestBody: content: - application/json: + multipart/form-data: schema: $ref: '#/components/schemas/extraction_request' required: true @@ -24,7 +24,7 @@ paths: type: array description: List of extracted information. "422": - description: Body is not a valid PDF. + description: Body is not a valid source. "500": description: Something somewhere went terribly wrong. tags: @@ -87,19 +87,19 @@ components: file: description: "" format: binary - title: file type: string type: description: "" - title: type type: string kwargs: description: "" items: $ref: '#/components/schemas/key_value_pair' - title: kwargs type: array + name: + description: "" + type: string required: + - name - type - title: extraction_request type: object diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index 6246635..eee5ada 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -23,9 +23,11 @@ ) from extractor_api_lib.models.extra_models import TokenModel # noqa: F401 -from typing import Any, List -from extractor_api_lib.models.extraction_request import ExtractionRequest +from pydantic import StrictBytes, StrictStr +from fastapi import Request, Response, UploadFile +from typing import Any, List, Optional, Tuple, Union from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair router = APIRouter() @@ -39,15 +41,18 @@ "/extract", responses={ 200: {"model": 
List[InformationPiece], "description": "List of extracted information."}, - 422: {"description": "Body is not a valid PDF."}, + 422: {"description": "Body is not a valid source."}, 500: {"description": "Something somewhere went terribly wrong."}, }, tags=["extractor"], response_model_by_alias=True, ) -async def extract_from_file_post( - extraction_request: ExtractionRequest = Body(None, description=""), +async def extract( + type: StrictStr = Form(None, description=""), + name: StrictStr = Form(None, description=""), + file: Optional[UploadFile] = Form(None, description=""), + kwargs: Optional[List[KeyValuePair]] = Form(None, description=""), ) -> List[InformationPiece]: if not BaseExtractorApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseExtractorApi.subclasses[0]().extract_from_file_post(extraction_request) + return await BaseExtractorApi.subclasses[0]().extract(type, name, file, kwargs) diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py index a0b1fb5..f7a7cf0 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py @@ -2,9 +2,11 @@ from typing import ClassVar, Dict, List, Tuple # noqa: F401 -from typing import Any, List -from extractor_api_lib.models.extraction_request import ExtractionRequest +from pydantic import StrictBytes, StrictStr +from typing import Any, List, Optional, Tuple, Union +from fastapi import Request, Response, UploadFile from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair class BaseExtractorApi: @@ -13,8 +15,11 @@ class BaseExtractorApi: def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) BaseExtractorApi.subclasses = BaseExtractorApi.subclasses + (cls,) - async def extract_from_file_post( + + 
async def extract( self, - extraction_request: ExtractionRequest, - ) -> List[InformationPiece]: - ... + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[List[KeyValuePair]], + ) -> List[InformationPiece]: ... diff --git a/extractor-api-lib/src/extractor_api_lib/models/content_type.py b/extractor-api-lib/src/extractor_api_lib/models/content_type.py index 195f424..ff7be41 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/content_type.py +++ b/extractor-api-lib/src/extractor_api_lib/models/content_type.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -19,7 +19,6 @@ from enum import Enum - try: from typing import Self except ImportError: @@ -27,20 +26,16 @@ class ContentType(str, Enum): - """ - - """ + """ """ """ allowed enum values """ - IMAGE = 'IMAGE' - TABLE = 'TABLE' - TEXT = 'TEXT' + IMAGE = "IMAGE" + TABLE = "TABLE" + TEXT = "TEXT" @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ContentType from a JSON string""" return cls(json.loads(json_str)) - - diff --git a/extractor-api-lib/src/extractor_api_lib/models/extra_models.py b/extractor-api-lib/src/extractor_api_lib/models/extra_models.py index a3a283f..f0588d2 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extra_models.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extra_models.py @@ -2,6 +2,7 @@ from pydantic import BaseModel + class TokenModel(BaseModel): """Defines a token model.""" diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py index 437442f..8917378 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union from extractor_api_lib.models.key_value_pair import KeyValuePair + try: from typing import Self except ImportError: from typing_extensions import Self + class ExtractionRequest(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None type: StrictStr kwargs: Optional[List[KeyValuePair]] = None @@ -43,7 +42,6 @@ class ExtractionRequest(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -70,8 +68,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) @@ -80,7 +77,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.kwargs: if _item: _items.append(_item.to_dict()) - _dict['kwargs'] = _items + _dict["kwargs"] = _items return _dict @classmethod @@ -92,11 +89,15 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "file": obj.get("file"), - "type": obj.get("type"), - "kwargs": [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] if obj.get("kwargs") is not None else None - }) + _obj = cls.model_validate( + { + "file": obj.get("file"), + "type": obj.get("type"), + "kwargs": ( + [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] + if obj.get("kwargs") is not None + else None + ), + } + ) return _obj - - diff --git a/extractor-api-lib/src/extractor_api_lib/models/information_piece.py b/extractor-api-lib/src/extractor_api_lib/models/information_piece.py index 98261ff..8890a13 
100644 --- a/extractor-api-lib/src/extractor_api_lib/models/information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/models/information_piece.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,21 +18,22 @@ import json - - from pydantic import BaseModel, ConfigDict, StrictStr from typing import Any, ClassVar, Dict, List from extractor_api_lib.models.content_type import ContentType from extractor_api_lib.models.key_value_pair import KeyValuePair + try: from typing import Self except ImportError: from typing_extensions import Self + class InformationPiece(BaseModel): """ A piece of information that has been extracted. 
- """ # noqa: E501 + """ # noqa: E501 + metadata: List[KeyValuePair] page_content: StrictStr type: ContentType @@ -44,7 +45,6 @@ class InformationPiece(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -71,8 +71,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in metadata (list) @@ -81,7 +80,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.metadata: if _item: _items.append(_item.to_dict()) - _dict['metadata'] = _items + _dict["metadata"] = _items return _dict @classmethod @@ -93,11 +92,15 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "metadata": [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] if obj.get("metadata") is not None else None, - "page_content": obj.get("page_content"), - "type": obj.get("type") - }) + _obj = cls.model_validate( + { + "metadata": ( + [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] + if obj.get("metadata") is not None + else None + ), + "page_content": obj.get("page_content"), + "type": obj.get("type"), + } + ) return _obj - - diff --git a/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py b/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py index 0cf865e..f751313 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py +++ b/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - extractor-api-lib +extractor-api-lib - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) +No description provided (generated by Openapi Generator 
https://github.com/openapitools/openapi-generator) - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,19 +18,18 @@ import json - - from pydantic import BaseModel, ConfigDict from typing import Any, ClassVar, Dict, List, Optional + try: from typing import Self except ImportError: from typing_extensions import Self + class KeyValuePair(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + key: Optional[Any] = None value: Optional[Any] = None __properties: ClassVar[List[str]] = ["key", "value"] @@ -41,7 +40,6 @@ class KeyValuePair(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -68,19 +66,18 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # set to None if key (nullable) is None # and model_fields_set contains the field if self.key is None and "key" in self.model_fields_set: - _dict['key'] = None + _dict["key"] = None # set to None if value (nullable) is None # and model_fields_set contains the field if self.value is None and "value" in self.model_fields_set: - _dict['value'] = None + _dict["value"] = None return _dict @@ -93,10 +90,5 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "key": obj.get("key"), - "value": obj.get("value") - }) + _obj = cls.model_validate({"key": obj.get("key"), "value": obj.get("value")}) return _obj - - diff --git a/rag-core-api/src/rag_core_api/apis/rag_api.py b/rag-core-api/src/rag_core_api/apis/rag_api.py index 425f48c..64597dd 100644 --- 
a/rag-core-api/src/rag_core_api/apis/rag_api.py +++ b/rag-core-api/src/rag_core_api/apis/rag_api.py @@ -12,7 +12,7 @@ from fastapi import ( # noqa: F401 APIRouter, - BackgroundTasks, + BackgroundTasks, Body, Cookie, Depends, @@ -21,7 +21,7 @@ HTTPException, Path, Query, - Request, + Request, Response, Security, status, @@ -57,6 +57,7 @@ async def _disconnected(request: Request) -> None: except CancelledError: break + @router.post( "/chat/{session_id}", responses={ @@ -69,7 +70,9 @@ async def _disconnected(request: Request) -> None: async def chat( request: Request, session_id: StrictStr = Path(..., description=""), - chat_request: Annotated[ChatRequest, Field(description="Chat with RAG.")] = Body(None, description="Chat with RAG."), + chat_request: Annotated[ChatRequest, Field(description="Chat with RAG.")] = Body( + None, description="Chat with RAG." + ), ) -> ChatResponse | None: """ Asynchronously handles the chat endpoint for the RAG API. @@ -125,8 +128,7 @@ async def chat( tags=["rag"], response_model_by_alias=True, ) -async def evaluate( -) -> None: +async def evaluate() -> None: """ Asynchronously evaluate the RAG. 
@@ -167,7 +169,7 @@ async def remove_information_piece( Returns ------- None - """ + """ if not BaseRagApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().remove_information_piece(delete_request) @@ -200,7 +202,7 @@ async def upload_information_piece( Returns ------- None - """ + """ if not BaseRagApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().upload_information_piece(information_piece) diff --git a/rag-core-api/src/rag_core_api/apis/rag_api_base.py b/rag-core-api/src/rag_core_api/apis/rag_api_base.py index 70d1406..0b53f4b 100644 --- a/rag-core-api/src/rag_core_api/apis/rag_api_base.py +++ b/rag-core-api/src/rag_core_api/apis/rag_api_base.py @@ -24,11 +24,13 @@ class BaseRagApi: subclasses : ClassVar[Tuple] A tuple that holds all subclasses of BaseRagApi. """ + subclasses: ClassVar[Tuple] = () def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) BaseRagApi.subclasses = BaseRagApi.subclasses + (cls,) + async def chat( self, session_id: StrictStr, @@ -52,7 +54,6 @@ async def chat( The chat response if the chat task completes successfully, otherwise None. """ - async def evaluate( self, ) -> None: @@ -64,7 +65,6 @@ async def evaluate( None """ - async def remove_information_piece( self, delete_request: DeleteRequest, @@ -84,7 +84,6 @@ async def remove_information_piece( None """ - async def upload_information_piece( self, information_piece: List[InformationPiece], diff --git a/rag-core-api/src/rag_core_api/models/chat_history.py b/rag-core-api/src/rag_core_api/models/chat_history.py index 71e2e8c..9087afe 100644 --- a/rag-core-api/src/rag_core_api/models/chat_history.py +++ b/rag-core-api/src/rag_core_api/models/chat_history.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. 
- The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict from typing import Any, ClassVar, Dict, List from rag_core_api.models.chat_history_message import ChatHistoryMessage + try: from typing import Self except ImportError: from typing_extensions import Self + class ChatHistory(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + messages: List[ChatHistoryMessage] __properties: ClassVar[List[str]] = ["messages"] @@ -41,7 +40,6 @@ class ChatHistory(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -68,8 +66,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in messages (list) @@ -78,7 +75,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.messages: if _item: _items.append(_item.to_dict()) - _dict['messages'] = _items + _dict["messages"] = _items return _dict @classmethod @@ -90,9 +87,13 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "messages": [ChatHistoryMessage.from_dict(_item) for _item in obj.get("messages")] if obj.get("messages") is not None else None - }) + _obj = cls.model_validate( + { + "messages": ( + [ChatHistoryMessage.from_dict(_item) for _item in obj.get("messages")] + if obj.get("messages") is not None + else None + ) + } + ) return _obj - - diff --git a/rag-core-api/src/rag_core_api/models/chat_history_message.py 
b/rag-core-api/src/rag_core_api/models/chat_history_message.py index 59da140..c9d782b 100644 --- a/rag-core-api/src/rag_core_api/models/chat_history_message.py +++ b/rag-core-api/src/rag_core_api/models/chat_history_message.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict, StrictStr from typing import Any, ClassVar, Dict, List from rag_core_api.models.chat_role import ChatRole + try: from typing import Self except ImportError: from typing_extensions import Self + class ChatHistoryMessage(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + role: ChatRole message: StrictStr __properties: ClassVar[List[str]] = ["role", "message"] @@ -42,7 +41,6 @@ class ChatHistoryMessage(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -69,8 +67,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) return _dict @@ -84,10 +81,5 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "role": obj.get("role"), - "message": obj.get("message") - }) + _obj = cls.model_validate({"role": obj.get("role"), "message": obj.get("message")}) return _obj - - diff --git a/rag-core-api/src/rag_core_api/models/chat_request.py b/rag-core-api/src/rag_core_api/models/chat_request.py index 9e28631..66090ef 100644 --- 
a/rag-core-api/src/rag_core_api/models/chat_request.py +++ b/rag-core-api/src/rag_core_api/models/chat_request.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict, StrictStr from typing import Any, ClassVar, Dict, List, Optional from rag_core_api.models.chat_history import ChatHistory + try: from typing import Self except ImportError: from typing_extensions import Self + class ChatRequest(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + history: Optional[ChatHistory] = None message: StrictStr __properties: ClassVar[List[str]] = ["history", "message"] @@ -42,7 +41,6 @@ class ChatRequest(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -69,13 +67,12 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of history if self.history: - _dict['history'] = self.history.to_dict() + _dict["history"] = self.history.to_dict() return _dict @classmethod @@ -87,10 +84,10 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "history": ChatHistory.from_dict(obj.get("history")) if obj.get("history") is not None else None, - "message": obj.get("message") - }) + _obj = cls.model_validate( + { + "history": ChatHistory.from_dict(obj.get("history")) 
if obj.get("history") is not None else None, + "message": obj.get("message"), + } + ) return _obj - - diff --git a/rag-core-api/src/rag_core_api/models/chat_response.py b/rag-core-api/src/rag_core_api/models/chat_response.py index 6a8daad..ba8c6b1 100644 --- a/rag-core-api/src/rag_core_api/models/chat_response.py +++ b/rag-core-api/src/rag_core_api/models/chat_response.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict, Field, StrictStr from typing import Any, ClassVar, Dict, List from rag_core_api.models.information_piece import InformationPiece + try: from typing import Self except ImportError: from typing_extensions import Self + class ChatResponse(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + answer: StrictStr finish_reason: StrictStr = Field(description=" ") citations: List[InformationPiece] @@ -43,7 +42,6 @@ class ChatResponse(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -70,8 +68,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in citations (list) @@ -80,7 +77,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.citations: if _item: _items.append(_item.to_dict()) - _dict['citations'] = _items + _dict["citations"] = _items return _dict @classmethod @@ -92,11 +89,15 @@ 
def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "answer": obj.get("answer"), - "finish_reason": obj.get("finish_reason"), - "citations": [InformationPiece.from_dict(_item) for _item in obj.get("citations")] if obj.get("citations") is not None else None - }) + _obj = cls.model_validate( + { + "answer": obj.get("answer"), + "finish_reason": obj.get("finish_reason"), + "citations": ( + [InformationPiece.from_dict(_item) for _item in obj.get("citations")] + if obj.get("citations") is not None + else None + ), + } + ) return _obj - - diff --git a/rag-core-api/src/rag_core_api/models/chat_role.py b/rag-core-api/src/rag_core_api/models/chat_role.py index d0bef70..7e1c88d 100644 --- a/rag-core-api/src/rag_core_api/models/chat_role.py +++ b/rag-core-api/src/rag_core_api/models/chat_role.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -19,7 +19,6 @@ from enum import Enum - try: from typing import Self except ImportError: @@ -27,19 +26,15 @@ class ChatRole(str, Enum): - """ - - """ + """ """ """ allowed enum values """ - USER = 'user' - ASSISTANT = 'assistant' + USER = "user" + ASSISTANT = "assistant" @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ChatRole from a JSON string""" return cls(json.loads(json_str)) - - diff --git a/rag-core-api/src/rag_core_api/models/content_type.py b/rag-core-api/src/rag_core_api/models/content_type.py index df72d7d..7f4d874 100644 --- a/rag-core-api/src/rag_core_api/models/content_type.py +++ b/rag-core-api/src/rag_core_api/models/content_type.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -19,7 +19,6 @@ from enum import Enum - try: from typing import Self except ImportError: @@ -27,21 +26,17 @@ class ContentType(str, Enum): - """ - - """ + """ """ """ allowed enum values """ - TEXT = 'TEXT' - IMAGE = 'IMAGE' - TABLE = 'TABLE' - SUMMARY = 'SUMMARY' + TEXT = "TEXT" + IMAGE = "IMAGE" + TABLE = "TABLE" + SUMMARY = "SUMMARY" @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of ContentType from a JSON string""" return cls(json.loads(json_str)) - - diff --git a/rag-core-api/src/rag_core_api/models/delete_request.py b/rag-core-api/src/rag_core_api/models/delete_request.py index 2c3592c..8b40339 100644 --- a/rag-core-api/src/rag_core_api/models/delete_request.py +++ b/rag-core-api/src/rag_core_api/models/delete_request.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. 
""" # noqa: E501 @@ -18,20 +18,19 @@ import json - - from pydantic import BaseModel, ConfigDict from typing import Any, ClassVar, Dict, List, Optional from rag_core_api.models.key_value_pair import KeyValuePair + try: from typing import Self except ImportError: from typing_extensions import Self + class DeleteRequest(BaseModel): - """ - - """ # noqa: E501 + """ """ # noqa: E501 + metadata: Optional[List[KeyValuePair]] = None __properties: ClassVar[List[str]] = ["metadata"] @@ -41,7 +40,6 @@ class DeleteRequest(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -68,8 +66,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in metadata (list) @@ -78,7 +75,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.metadata: if _item: _items.append(_item.to_dict()) - _dict['metadata'] = _items + _dict["metadata"] = _items return _dict @classmethod @@ -90,9 +87,13 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "metadata": [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] if obj.get("metadata") is not None else None - }) + _obj = cls.model_validate( + { + "metadata": ( + [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] + if obj.get("metadata") is not None + else None + ) + } + ) return _obj - - diff --git a/rag-core-api/src/rag_core_api/models/extra_models.py b/rag-core-api/src/rag_core_api/models/extra_models.py index a3a283f..f0588d2 100644 --- a/rag-core-api/src/rag_core_api/models/extra_models.py +++ b/rag-core-api/src/rag_core_api/models/extra_models.py @@ -2,6 +2,7 @@ from pydantic import BaseModel + class TokenModel(BaseModel): """Defines a token 
model.""" diff --git a/rag-core-api/src/rag_core_api/models/information_piece.py b/rag-core-api/src/rag_core_api/models/information_piece.py index 28d5115..dfe8a42 100644 --- a/rag-core-api/src/rag_core_api/models/information_piece.py +++ b/rag-core-api/src/rag_core_api/models/information_piece.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. - The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,22 +18,25 @@ import json - - from pydantic import BaseModel, ConfigDict, Field, StrictStr from typing import Any, ClassVar, Dict, List from rag_core_api.models.content_type import ContentType from rag_core_api.models.key_value_pair import KeyValuePair + try: from typing import Self except ImportError: from typing_extensions import Self + class InformationPiece(BaseModel): """ Uploading a json with chunks and metadata. - """ # noqa: E501 - metadata: List[KeyValuePair] = Field(description="The metadata of the documents that are stored in the vectordatabase.") + """ # noqa: E501 + + metadata: List[KeyValuePair] = Field( + description="The metadata of the documents that are stored in the vectordatabase." 
+ ) page_content: StrictStr = Field(description="The content of the document") type: ContentType __properties: ClassVar[List[str]] = ["metadata", "page_content", "type"] @@ -44,7 +47,6 @@ class InformationPiece(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -71,8 +73,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) # override the default output from pydantic by calling `to_dict()` of each item in metadata (list) @@ -81,7 +82,7 @@ def to_dict(self) -> Dict[str, Any]: for _item in self.metadata: if _item: _items.append(_item.to_dict()) - _dict['metadata'] = _items + _dict["metadata"] = _items return _dict @classmethod @@ -93,11 +94,15 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "metadata": [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] if obj.get("metadata") is not None else None, - "page_content": obj.get("page_content"), - "type": obj.get("type") - }) + _obj = cls.model_validate( + { + "metadata": ( + [KeyValuePair.from_dict(_item) for _item in obj.get("metadata")] + if obj.get("metadata") is not None + else None + ), + "page_content": obj.get("page_content"), + "type": obj.get("type"), + } + ) return _obj - - diff --git a/rag-core-api/src/rag_core_api/models/key_value_pair.py b/rag-core-api/src/rag_core_api/models/key_value_pair.py index b9654c3..3079959 100644 --- a/rag-core-api/src/rag_core_api/models/key_value_pair.py +++ b/rag-core-api/src/rag_core_api/models/key_value_pair.py @@ -1,14 +1,14 @@ # coding: utf-8 """ - RAG SIT x Stackit +RAG SIT x Stackit - The perfect rag solution. +The perfect rag solution. 
- The version of the OpenAPI document: 1.0.0 - Generated by OpenAPI Generator (https://openapi-generator.tech) +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) - Do not edit the class manually. +Do not edit the class manually. """ # noqa: E501 @@ -18,19 +18,20 @@ import json - - from pydantic import BaseModel, ConfigDict, Field, StrictStr from typing import Any, ClassVar, Dict, List + try: from typing import Self except ImportError: from typing_extensions import Self + class KeyValuePair(BaseModel): """ The key value pair. - """ # noqa: E501 + """ # noqa: E501 + key: StrictStr value: StrictStr = Field(description=" ") __properties: ClassVar[List[str]] = ["key", "value"] @@ -41,7 +42,6 @@ class KeyValuePair(BaseModel): "protected_namespaces": (), } - def to_str(self) -> str: """Returns the string representation of the model using alias""" return pprint.pformat(self.model_dump(by_alias=True)) @@ -68,8 +68,7 @@ def to_dict(self) -> Dict[str, Any]: """ _dict = self.model_dump( by_alias=True, - exclude={ - }, + exclude={}, exclude_none=True, ) return _dict @@ -83,10 +82,5 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate({ - "key": obj.get("key"), - "value": obj.get("value") - }) + _obj = cls.model_validate({"key": obj.get("key"), "value": obj.get("value")}) return _obj - - From 1a7b9d700d72f8f92c69f3ecac95af0a442e9ce7 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 9 May 2025 15:40:25 +0200 Subject: [PATCH 03/56] switch to one uploader for all types --- .../api_endpoints/source_uploader.py | 14 +- .../src/admin_api_lib/apis/admin_api.py | 3 +- .../src/admin_api_lib/apis/admin_api_base.py | 1 - .../src/admin_api_lib/dependency_container.py | 22 +-- .../src/admin_api_lib/impl/admin_api.py | 56 +++----- .../api_endpoints/default_source_uploader.py | 125 ++++++++++++++++++ 6 files changed, 158 insertions(+), 63 deletions(-) 
create mode 100644 admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py index 2cfbf2f..0c9b73e 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -1,8 +1,10 @@ -from dataclasses import Field -from typing_extensions import Annotated from abc import ABC, abstractmethod +from typing import Optional -from admin_api_lib.models.upload_source import UploadSource +from pydantic import StrictStr +from fastapi import UploadFile + +from admin_api_lib.models.key_value_pair import KeyValuePair class SourceUploader(ABC): @@ -10,5 +12,9 @@ class SourceUploader(ABC): @abstractmethod async def upload_source( self, - upload_source: Annotated[UploadSource, Field(description="The source to upload.")], + base_url: str, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], ) -> None: ... diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 81d55f5..ccaed84 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -99,6 +99,7 @@ async def document_reference_id_get( raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().document_reference_id_get(identification) + @router.get( "/all_documents_status", responses={ @@ -116,7 +117,7 @@ async def get_all_documents_status() -> List[DocumentStatus]: ------- list[DocumentStatus] A list containing the status of all documents. 
- """ + """ if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().get_all_documents_status() diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index 34bce77..48e22dc 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -52,7 +52,6 @@ async def document_reference_id_get( The response object containing the document reference details. """ - async def get_all_documents_status( self, ) -> list[DocumentStatus]: diff --git a/admin-api-lib/src/admin_api_lib/dependency_container.py b/admin-api-lib/src/admin_api_lib/dependency_container.py index 9079a47..4ca3b57 100644 --- a/admin-api-lib/src/admin_api_lib/dependency_container.py +++ b/admin-api-lib/src/admin_api_lib/dependency_container.py @@ -1,5 +1,6 @@ """Module for the DependencyContainer class.""" +from admin_api_lib.impl.api_endpoints.default_source_uploader import DefaultSourceUploader from dependency_injector.containers import DeclarativeContainer from dependency_injector.providers import ( # noqa: WOT001 Configuration, @@ -164,23 +165,12 @@ class DependencyContainer(DeclarativeContainer): DefaultDocumentDeleter, rag_api=rag_api, file_service=file_service, key_value_store=key_value_store ) documents_status_retriever = Singleton(DefaultDocumentsStatusRetriever, key_value_store=key_value_store) - confluence_loader = Singleton( - DefaultConfluenceLoader, - extractor_api=document_extractor, - rag_api=rag_api, - key_value_store=key_value_store, - settings=confluence_settings, - information_enhancer=information_enhancer, - information_mapper=information_mapper, - chunker=chunker, - document_deleter=document_deleter, - settings_mapper=confluence_settings_mapper, - ) + document_reference_retriever = Singleton(DefaultDocumentReferenceRetriever, file_service=file_service) - document_uploader = 
Singleton( - DefaultDocumentUploader, - document_extractor=document_extractor, - file_service=file_service, + + source_uploader = Singleton( + DefaultSourceUploader, + extractor_api=document_extractor, rag_api=rag_api, information_enhancer=information_enhancer, information_mapper=information_mapper, diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index 9c24eba..25745c5 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -1,7 +1,14 @@ """Module containing the implementation of the Admin API.""" +from dataclasses import Field import logging +from typing import List, Optional +from typing_extensions import Annotated +from pydantic import Field, StrictBytes, StrictStr +from admin_api_lib.api_endpoints.source_uploader import SourceUploader +from admin_api_lib.models.key_value_pair import KeyValuePair +from admin_api_lib.models.upload_source import UploadSource from dependency_injector.wiring import Provide, inject from fastapi import Depends, Request, Response, UploadFile @@ -85,24 +92,16 @@ async def get_all_documents_status( return await document_status_retriever.aget_all_documents_status() @inject - async def load_confluence_post( + async def upload_source( self, - confluence_loader: ConfluenceLoader = Depends(Provide[DependencyContainer.confluence_loader]), + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[List[KeyValuePair]], + request: Request, + source_uploader: SourceUploader = Depends(Provide[DependencyContainer.source_uploader]), ) -> None: - """ - Asynchronously loads a Confluence space using the provided ConfluenceLoader. - - Parameters - ---------- - confluence_loader : ConfluenceLoader - The ConfluenceLoader instance to use for loading the post. This is injected by dependency injection - (default is Depends(Provide[DependencyContainer.confluence_loader])). 
- - Returns - ------- - None - """ - await confluence_loader.aload_from_confluence() + await source_uploader.upload_source(str(request.base_url), type, name, file, kwargs) @inject async def document_reference_id_get( @@ -129,28 +128,3 @@ async def document_reference_id_get( The document in binary form. """ return await document_reference_retriever.adocument_reference_id_get(identification) - - @inject - async def upload_documents_post( - self, - body: UploadFile, - request: Request, - document_uploader: DocumentUploader = Depends(Provide[DependencyContainer.document_uploader]), - ) -> None: - """ - Handle the POST request to upload documents. - - Parameters - ---------- - body : UploadFile - The file to be uploaded. - request : Request - The request object containing metadata about the request. - document_uploader : DocumentUploader, optional - The document uploader dependency, by default provided by DependencyContainer. - - Returns - ------- - None - """ - await document_uploader.aupload_documents_post(body, request) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py new file mode 100644 index 0000000..1b2f31c --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -0,0 +1,125 @@ +from http.client import HTTPException +import logging +from typing import Optional +from threading import Thread +import urllib + +from pydantic import StrictStr +from fastapi import UploadFile, status +from langchain_core.documents import Document +from asyncio import run + +from admin_api_lib.models.key_value_pair import KeyValuePair +from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi +from admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document +from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter +from admin_api_lib.api_endpoints.source_uploader 
import SourceUploader +from admin_api_lib.chunker.chunker import Chunker +from admin_api_lib.models.status import Status +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi +from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore +from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer +from admin_api_lib.utils.utils import sanitize_document_name + +logger = logging.getLogger(__name__) + + +class DefaultSourceUploader(SourceUploader): + + def __init__( + self, + extractor_api: ExtractorApi, + key_value_store: FileStatusKeyValueStore, + information_enhancer: InformationEnhancer, + chunker: Chunker, + document_deleter: DocumentDeleter, + rag_api: RagApi, + information_mapper: InformationPiece2Document, + ): + self._extractor_api = extractor_api + self._rag_api = rag_api + self._key_value_store = key_value_store + self._information_mapper = information_mapper + self._information_enhancer = information_enhancer + self._chunker = chunker + self._document_deleter = document_deleter + self._background_threads = [] + + async def upload_source( + self, + base_url: str, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], + ) -> None: + self._background_threads = [t for t in self._background_threads if t.is_alive()] + source_name = f"{type}:{sanitize_document_name(name)}" + try: + # TODO: check if document already in processing state + self._key_value_store.upsert( + source_name, Status.PROCESSING + ) # TODO: change to pipeline with timeout to error status + thread = Thread( + target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, file, kwargs)) + ) + thread.start() + self._background_threads.append(thread) + except ValueError as e: + self._key_value_store.upsert(source_name, Status.ERROR) + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) + except Exception as 
e: + self._key_value_store.upsert(source_name, Status.ERROR) + logger.error("Error while uploading %s = %s", source_name, str(e)) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) + + async def _handle_source_upload( + self, + source_name: str, + base_url: str, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], + ): + try: + information_pieces = self._extractor_api.extract(type, name, file, kwargs) + if not information_pieces: + self._key_value_store.upsert(source_name, Status.ERROR) + logger.error("No information pieces found in the document: %s", source_name) + documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces] + + chunked_documents = self._chunker.chunk(documents) + self._add_file_url(type, file, base_url, chunked_documents) + + enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents) + rag_information_pieces = [ + self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents + ] + + # Replace old document + await self._document_deleter.adelete_document(source_name) + self._rag_api.upload_information_piece(rag_information_pieces) + self._key_value_store.upsert(source_name, Status.READY) + logger.info("File uploaded successfully: %s", source_name) + except Exception as e: + self._key_value_store.upsert(source_name, Status.ERROR) + logger.error("Error while uploading %s = %s", source_name, str(e)) + + def _add_file_url( + self, type: StrictStr, file: Optional[UploadFile], base_url: str, chunked_documents: list[Document] + ): + if type != "file": + return + + document_url = f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file.name)}" + for idx, chunk in enumerate(chunked_documents): + if chunk.metadata["id"] in chunk.metadata["related"]: + chunk.metadata["related"].remove(chunk.metadata["id"]) + chunk.metadata.update( + { + "chunk": idx, + 
"chunk_length": len(chunk.page_content), + "document_url": document_url, + } + ) From 7e4a9d0f793e47a52b1ffa3225ea503ad788b459 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Tue, 13 May 2025 15:06:49 +0200 Subject: [PATCH 04/56] extractor mostly working --- .../{confluence_extractor.py => extractor.py} | 19 ++-- .../api_endpoints/file_extractor.py | 26 ------ .../extractor_api_lib/apis/extractor_api.py | 14 +-- .../extractor_api_lib/dependency_container.py | 28 +++--- .../__init__.py | 0 .../extractors/information_extractor.py | 43 +++++++++ .../information_file_extractor.py} | 11 +-- .../impl/api_endpoints/default_extractor.py | 68 ++++++++++++++ .../api_endpoints/default_file_extractor.py | 65 -------------- .../impl/document_parser/general_extractor.py | 66 -------------- .../impl/extractor_api_impl.py | 48 ++++------ .../extractors}/__init__.py | 0 .../confluence_extractor.py} | 41 +++++---- .../file_extractors}/__init__.py | 0 .../file_extractors}/ms_docs_extractor.py | 24 ++--- .../file_extractors}/pdf_extractor.py | 18 ++-- .../file_extractors}/xml_extractor.py | 20 +++-- .../impl/extractors/general_file_extractor.py | 90 +++++++++++++++++++ .../impl/file_services/__init__.py | 0 .../internal2external_information_piece.py | 4 +- .../impl/types/extractor_types.py | 9 ++ ...piece.py => internal_information_piece.py} | 2 +- 22 files changed, 328 insertions(+), 268 deletions(-) rename extractor-api-lib/src/extractor_api_lib/api_endpoints/{confluence_extractor.py => extractor.py} (57%) delete mode 100644 extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py rename extractor-api-lib/src/extractor_api_lib/{document_parser => extractors}/__init__.py (100%) create mode 100644 extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py rename extractor-api-lib/src/extractor_api_lib/{document_parser/information_extractor.py => extractors/information_file_extractor.py} (78%) create mode 100644 
extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_extractor.py delete mode 100644 extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_file_extractor.py delete mode 100644 extractor-api-lib/src/extractor_api_lib/impl/document_parser/general_extractor.py rename extractor-api-lib/src/extractor_api_lib/{file_services => impl/extractors}/__init__.py (100%) rename extractor-api-lib/src/extractor_api_lib/impl/{api_endpoints/default_confluence_extractor.py => extractors/confluence_extractor.py} (58%) rename extractor-api-lib/src/extractor_api_lib/impl/{document_parser => extractors/file_extractors}/__init__.py (100%) rename extractor-api-lib/src/extractor_api_lib/impl/{document_parser => extractors/file_extractors}/ms_docs_extractor.py (89%) rename extractor-api-lib/src/extractor_api_lib/impl/{document_parser => extractors/file_extractors}/pdf_extractor.py (94%) rename extractor-api-lib/src/extractor_api_lib/impl/{document_parser => extractors/file_extractors}/xml_extractor.py (83%) create mode 100644 extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py delete mode 100644 extractor-api-lib/src/extractor_api_lib/impl/file_services/__init__.py create mode 100644 extractor-api-lib/src/extractor_api_lib/impl/types/extractor_types.py rename extractor-api-lib/src/extractor_api_lib/models/dataclasses/{information_piece.py => internal_information_piece.py} (92%) diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/extractor.py similarity index 57% rename from extractor-api-lib/src/extractor_api_lib/api_endpoints/confluence_extractor.py rename to extractor-api-lib/src/extractor_api_lib/api_endpoints/extractor.py index d1aae80..c3f254b 100644 --- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/confluence_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/extractor.py @@ -1,16 +1,23 @@ -"""Module for 
the ConfluenceExtractor abstract base class.""" - from abc import ABC, abstractmethod +from typing import Optional + +from pydantic import StrictStr +from fastapi import UploadFile -from extractor_api_lib.models.confluence_parameters import ConfluenceParameters from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair -class ConfluenceExtractor(ABC): - """Abstract base class for extract_from_confluence endpoint.""" +class Extractor(ABC): @abstractmethod - async def aextract_from_confluence(self, confluence_parameters: ConfluenceParameters) -> list[InformationPiece]: + async def aextract_information( + self, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], + ) -> list[InformationPiece]: """ Extract information from confluence, using the given confluence parameters. diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py deleted file mode 100644 index 523f159..0000000 --- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Module for the FileExtractor abstract base class.""" - -from abc import ABC, abstractmethod - -from extractor_api_lib.models.extraction_request import ExtractionRequest -from extractor_api_lib.models.information_piece import InformationPiece - - -class FileExtractor(ABC): - """Abstract base class for extract_information endpoint.""" - - @abstractmethod - async def aextract_information(self, extraction_request: ExtractionRequest) -> list[InformationPiece]: - """ - Extract information of a document, given by the extraction_request. - - Parameters - ---------- - extraction_request : ExtractionRequest - The request containing the details of the document to be processed for information extraction. 
- - Returns - ------- - list[InformationPiece] - A list of extracted information pieces from the document. - """ diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index eee5ada..0cbdc2b 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -1,11 +1,11 @@ # coding: utf-8 -from typing import Dict, List # noqa: F401 +from typing import Annotated, Dict, List # noqa: F401 import importlib import pkgutil from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi -import openapi_server.impl +import extractor_api_lib.impl from fastapi import ( # noqa: F401 APIRouter, @@ -32,7 +32,7 @@ router = APIRouter() -ns_pkg = openapi_server.impl +ns_pkg = extractor_api_lib.impl for _, name, _ in pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + "."): importlib.import_module(name) @@ -48,10 +48,10 @@ response_model_by_alias=True, ) async def extract( - type: StrictStr = Form(None, description=""), - name: StrictStr = Form(None, description=""), - file: Optional[UploadFile] = Form(None, description=""), - kwargs: Optional[List[KeyValuePair]] = Form(None, description=""), + type: Annotated[str, Form()], + name: Annotated[str, Form()], + file: Optional[UploadFile] = None, + kwargs: Optional[Annotated[List[KeyValuePair], Form()]]=None, ) -> List[InformationPiece]: if not BaseExtractorApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") diff --git a/extractor-api-lib/src/extractor_api_lib/dependency_container.py b/extractor-api-lib/src/extractor_api_lib/dependency_container.py index e3bcaf1..2c5c53f 100644 --- a/extractor-api-lib/src/extractor_api_lib/dependency_container.py +++ b/extractor-api-lib/src/extractor_api_lib/dependency_container.py @@ -3,16 +3,12 @@ from dependency_injector.containers import DeclarativeContainer from dependency_injector.providers import 
List, Singleton # noqa: WOT001 -from extractor_api_lib.impl.api_endpoints.default_confluence_extractor import ( - DefaultConfluenceExtractor, -) -from extractor_api_lib.impl.api_endpoints.default_file_extractor import ( - DefaultFileExtractor, -) -from extractor_api_lib.impl.document_parser.general_extractor import GeneralExtractor -from extractor_api_lib.impl.document_parser.ms_docs_extractor import MSDocsExtractor -from extractor_api_lib.impl.document_parser.pdf_extractor import PDFExtractor -from extractor_api_lib.impl.document_parser.xml_extractor import XMLExtractor +from extractor_api_lib.impl.api_endpoints.default_extractor import DefaultExtractor +from extractor_api_lib.impl.extractors.confluence_extractor import ConfluenceExtractor +from extractor_api_lib.impl.extractors.file_extractors.ms_docs_extractor import MSDocsExtractor +from extractor_api_lib.impl.extractors.file_extractors.pdf_extractor import PDFExtractor +from extractor_api_lib.impl.extractors.file_extractors.xml_extractor import XMLExtractor +from extractor_api_lib.impl.extractors.general_file_extractor import GeneralFileExtractor from extractor_api_lib.impl.file_services.s3_service import S3Service from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import ( ConfluenceLangchainDocument2InformationPiece, @@ -40,11 +36,13 @@ class DependencyContainer(DeclarativeContainer): intern2external = Singleton(Internal2ExternalInformationPiece) langchain_document2information_piece = Singleton(ConfluenceLangchainDocument2InformationPiece) - all_extractors = List(pdf_extractor, ms_docs_extractor, xml_extractor) + file_extractors = List(pdf_extractor, ms_docs_extractor, xml_extractor) - general_extractor = Singleton(GeneralExtractor, file_service, all_extractors) + general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors) + confluence_extractor = Singleton(ConfluenceExtractor, mapper=langchain_document2information_piece) - file_extractor = 
Singleton( - DefaultFileExtractor, information_extractor=general_extractor, file_service=file_service, mapper=intern2external + default_extractor = Singleton( + DefaultExtractor, + mapper=intern2external, + available_extractors=List(general_file_extractor, confluence_extractor), ) - confluence_extractor = Singleton(DefaultConfluenceExtractor, mapper=langchain_document2information_piece) diff --git a/extractor-api-lib/src/extractor_api_lib/document_parser/__init__.py b/extractor-api-lib/src/extractor_api_lib/extractors/__init__.py similarity index 100% rename from extractor-api-lib/src/extractor_api_lib/document_parser/__init__.py rename to extractor-api-lib/src/extractor_api_lib/extractors/__init__.py diff --git a/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py new file mode 100644 index 0000000..eeaadf1 --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py @@ -0,0 +1,43 @@ +"""Module for the Base class for Information extractors.""" + +from abc import ABC, abstractmethod +from typing import Optional + + +from fastapi import UploadFile +from pydantic import StrictStr + +from extractor_api_lib.impl.types.extractor_types import ExtractorTypes +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece + + +class InformationExtractor(ABC): + """Base class for Information extractors.""" + + @property + @abstractmethod + def extractor_type(self) -> ExtractorTypes: ... + + @abstractmethod + async def aextract_content( + self, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], + ) -> list[InternalInformationPiece]: + """ + Extract content from given file. 
+ + Parameters + ---------- + file_path : Path + Path to the file the information should be extracted from. + + Returns + ------- + list[InformationPiece] + The extracted information. + """ diff --git a/extractor-api-lib/src/extractor_api_lib/document_parser/information_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py similarity index 78% rename from extractor-api-lib/src/extractor_api_lib/document_parser/information_extractor.py rename to extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py index 0c3c4ce..8b54f1c 100644 --- a/extractor-api-lib/src/extractor_api_lib/document_parser/information_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py @@ -3,13 +3,14 @@ from abc import ABC, abstractmethod from pathlib import Path -from extractor_api_lib.file_services.file_service import FileService +from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.impl.types.file_type import FileType -from extractor_api_lib.models.dataclasses.information_piece import InformationPiece +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece +from extractor_api_lib.file_services.file_service import FileService -class InformationExtractor(ABC): - """Base class for Information extractors.""" +class InformationFileExtractor(ABC): + """Base class for Information file extractors.""" def __init__(self, file_service: FileService): """Initialize the InformationExtractor. @@ -34,7 +35,7 @@ def compatible_file_types(self) -> list[FileType]: """ @abstractmethod - def extract_content(self, file_path: Path) -> list[InformationPiece]: + async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: """ Extract content from given file. 
diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_extractor.py new file mode 100644 index 0000000..b485c1e --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_extractor.py @@ -0,0 +1,68 @@ +"""Module for the DefaultFileExtractor class.""" + +import logging +from typing import Optional + +from pydantic import StrictStr +from fastapi import UploadFile + +from extractor_api_lib.extractors.information_extractor import InformationExtractor +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair +from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece +from extractor_api_lib.api_endpoints.extractor import Extractor +from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair +from extractor_api_lib.impl.types.extractor_types import ExtractorTypes +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece + + +logger = logging.getLogger(__name__) + + +class DefaultExtractor(Extractor): + """A class to extract information from documents using available extractors. + + This class serves as a general extractor that utilizes a list of available + information extractors to extract content from documents. It determines the + appropriate extractor based on the file type of the document. + """ + + def __init__(self, available_extractors: list[InformationExtractor], mapper: Internal2ExternalInformationPiece): + """ + Initialize the GeneralExtractor. 
+ + Parameters + ---------- + available_extractors : list of InformationExtractor + A list of available information extractors to be used by the GeneralExtractor. + """ + self._mapper = mapper + self._available_extractors = available_extractors + + async def aextract_information( + self, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], + ) -> list[InformationPiece]: + """ + Extract content from given file. + + Parameters + ---------- + file_path : Path + Path to the file the information should be extracted from. + + Returns + ------- + list[InformationPiece] + The extracted information. + """ + correct_extractors = [x for x in self._available_extractors if type == x.extractor_type] + if not correct_extractors: + raise ValueError(f"No extractor found for type {type}") + results = await correct_extractors[-1].aextract_content(type, name, file, kwargs) + return [self._mapper.map_internal_to_external(x) for x in results if x.page_content is not None] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_file_extractor.py deleted file mode 100644 index 787997b..0000000 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_file_extractor.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Module for the DefaultFileExtractor class.""" - -import tempfile -from pathlib import Path - -from extractor_api_lib.api_endpoints.file_extractor import FileExtractor -from extractor_api_lib.document_parser.information_extractor import InformationExtractor -from extractor_api_lib.file_services.file_service import FileService -from extractor_api_lib.impl.mapper.internal2external_information_piece import ( - Internal2ExternalInformationPiece, -) -from extractor_api_lib.models.extraction_request import ExtractionRequest -from extractor_api_lib.models.information_piece import InformationPiece - - -class 
DefaultFileExtractor(FileExtractor): - """Default implementation of the FileExtractor interface.""" - - def __init__( - self, - information_extractor: InformationExtractor, - file_service: FileService, - mapper: Internal2ExternalInformationPiece, - ): - """ - Initialize the DefaultFileExtractor. - - Parameters - ---------- - information_extractor : InformationExtractor - An instance of InformationExtractor to extract information from files. - file_service : FileService - An instance of FileService to handle file operations. - mapper : Internal2ExternalInformationPiece - An instance of Internal2ExternalInformationPiece to map internal information to external format. - """ - self.information_extractor = information_extractor - self.file_service = file_service - self.mapper = mapper - - async def aextract_information( - self, - extraction_request: ExtractionRequest, - ) -> list[InformationPiece]: - """ - Extract information from a document specified in the extraction request. - - Parameters - ---------- - extraction_request : ExtractionRequest - The request containing details about the document to be extracted, including its path on S3. - - Returns - ------- - list[InformationPiece] - A list of extracted information pieces from the document, where each piece contains non-null page content. 
- """ - with tempfile.TemporaryDirectory() as temp_dir: - temp_file_path = Path(temp_dir) / extraction_request.path_on_s3 - - with open(temp_file_path, "wb") as temp_file: - self.file_service.download_file(extraction_request.path_on_s3, temp_file) - - results = self.information_extractor.extract_content(temp_file_path) - return [self.mapper.map_internal_to_external(x) for x in results if x.page_content is not None] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/general_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/document_parser/general_extractor.py deleted file mode 100644 index 05946bf..0000000 --- a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/general_extractor.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Module for the GeneralExtractor class.""" - -from pathlib import Path - -from extractor_api_lib.document_parser.information_extractor import InformationExtractor -from extractor_api_lib.file_services.file_service import FileService -from extractor_api_lib.impl.types.file_type import FileType -from extractor_api_lib.models.dataclasses.information_piece import InformationPiece - - -class GeneralExtractor(InformationExtractor): - """A class to extract information from documents using available extractors. - - This class serves as a general extractor that utilizes a list of available - information extractors to extract content from documents. It determines the - appropriate extractor based on the file type of the document. - """ - - def __init__(self, file_service: FileService, available_extractors: list[InformationExtractor]): - """ - Initialize the GeneralExtractor. - - Parameters - ---------- - file_service : FileService - An instance of FileService to handle file operations. - available_extractors : list of InformationExtractor - A list of available information extractors to be used by the GeneralExtractor. 
- """ - super().__init__(file_service=file_service) - - self._available_extractors = available_extractors - - @property - def compatible_file_types(self) -> list[FileType]: - """ - List of compatible file types for the document parser. - - Returns - ------- - list[FileType] - A list containing the compatible file types. By default, it returns a list with FileType.NONE. - """ - return [FileType.NONE] - - def extract_content(self, file_path: Path) -> list[InformationPiece]: - """ - Extract content from given file. - - Parameters - ---------- - file_path : Path - Path to the file the information should be extracted from. - - Returns - ------- - list[InformationPiece] - The extracted information. - """ - file_type = str(file_path).split(".")[-1].upper() - correct_extractors = [ - x for x in self._available_extractors if file_type in [y.value for y in x.compatible_file_types] - ] - if not correct_extractors: - raise ValueError(f"No extractor found for file-ending {file_type}") - return correct_extractors[-1].extract_content(file_path) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py index d4a3760..bfe9393 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py @@ -1,14 +1,16 @@ """Module for the implementation of the ExtractorApi interface.""" from dependency_injector.wiring import Provide, inject -from fastapi import Depends +from extractor_api_lib.api_endpoints.extractor import Extractor +from fastapi import Depends, UploadFile + +from pydantic import StrictStr +from typing import Optional +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair -from extractor_api_lib.api_endpoints.confluence_extractor import ConfluenceExtractor -from extractor_api_lib.api_endpoints.file_extractor import 
FileExtractor from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi from extractor_api_lib.dependency_container import DependencyContainer -from extractor_api_lib.models.confluence_parameters import ConfluenceParameters -from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece @@ -16,13 +18,16 @@ class ExtractorApiImpl(BaseExtractorApi): """Implementation of the ExtractorApi interface.""" @inject - async def extract_from_file_post( + async def extract( self, - extraction_request: ExtractionRequest, - file_extractor: FileExtractor = Depends(Provide[DependencyContainer.file_extractor]), + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], + extractor: Extractor = Depends(Provide[DependencyContainer.default_extractor]), ) -> list[InformationPiece]: """ - Extract information from a file based on the provided extraction request. + Extract information from a source. Parameters ---------- @@ -36,27 +41,4 @@ async def extract_from_file_post( list[InformationPiece] A list of extracted information pieces. """ - return await file_extractor.aextract_information(extraction_request) - - @inject - async def extract_from_confluence_post( - self, - confluence_parameters: ConfluenceParameters, - confluence_extractor: ConfluenceExtractor = Depends(Provide[DependencyContainer.confluence_extractor]), - ) -> list[InformationPiece]: - """ - Extract information from Confluence asynchronously. - - Parameters - ---------- - confluence_parameters : ConfluenceParameters - Parameters required to extract information from Confluence. - confluence_extractor : ConfluenceExtractor, optional - The Confluence extractor instance (default is provided by DependencyContainer). - - Returns - ------- - list[InformationPiece] - A list of extracted information pieces from the configured Confluence space. 
- """ - return await confluence_extractor.aextract_from_confluence(confluence_parameters) + return await extractor.aextract_information(type, name, file, kwargs) diff --git a/extractor-api-lib/src/extractor_api_lib/file_services/__init__.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/__init__.py similarity index 100% rename from extractor-api-lib/src/extractor_api_lib/file_services/__init__.py rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/__init__.py diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py similarity index 58% rename from extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_confluence_extractor.py rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py index b752f6c..1f7c666 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_confluence_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py @@ -1,20 +1,24 @@ """Module for the DefaultConfluenceExtractor class.""" +from typing import Optional + +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece +from pydantic import StrictStr from langchain_community.document_loaders import ConfluenceLoader +from fastapi import UploadFile -from extractor_api_lib.api_endpoints.confluence_extractor import ConfluenceExtractor +from extractor_api_lib.impl.types.extractor_types import ExtractorTypes +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair +from extractor_api_lib.extractors.information_extractor import InformationExtractor from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import ( ConfluenceLangchainDocument2InformationPiece, ) -from 
extractor_api_lib.models.confluence_parameters import ConfluenceParameters -from extractor_api_lib.models.information_piece import InformationPiece -class DefaultConfluenceExtractor(ConfluenceExtractor): +class ConfluenceExtractor(InformationExtractor): """Default implementation of the FileExtractor interface.""" - MIN_PAGE_CONTENT_LENGTH = 10 - def __init__( self, mapper: ConfluenceLangchainDocument2InformationPiece, @@ -30,7 +34,17 @@ def __init__( """ self.mapper = mapper - async def aextract_from_confluence(self, confluence_parameters: ConfluenceParameters) -> list[InformationPiece]: + @property + def extractor_type(self) -> ExtractorTypes: + return ExtractorTypes.CONFLUENCE + + async def aextract_content( + self, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[list[KeyValuePair]], + ) -> list[InternalInformationPiece]: """ Asynchronously extracts information pieces from Confluence. @@ -41,17 +55,14 @@ async def aextract_from_confluence(self, confluence_parameters: ConfluenceParame Returns ------- - list[InformationPiece] + list[InternalInformationPiece] A list of information pieces extracted from Confluence. 
""" - self.mapper.confluence_parameters = confluence_parameters - confluence_kwargs = {} - for ckwargs in confluence_parameters.confluence_kwargs: - confluence_kwargs[ckwargs.key] = ckwargs.value - confluence_loader_parameters = confluence_parameters.model_dump() - confluence_loader_parameters["confluence_kwargs"] = confluence_kwargs + # Convert list of key value pairs to dict + confluence_loader_parameters = {x.key: x.value for x in kwargs} # Drop the document_name parameter as it is not used by the ConfluenceLoader - confluence_loader_parameters.pop("document_name", None) + if "document_name" in confluence_loader_parameters: + confluence_loader_parameters.pop("document_name", None) document_loader = ConfluenceLoader(**confluence_loader_parameters) documents = document_loader.load() return [self.mapper.map_document2informationpiece(x) for x in documents] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/__init__.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/__init__.py similarity index 100% rename from extractor-api-lib/src/extractor_api_lib/impl/document_parser/__init__.py rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/__init__.py diff --git a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/ms_docs_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py similarity index 89% rename from extractor-api-lib/src/extractor_api_lib/impl/document_parser/ms_docs_extractor.py rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py index 8bb23ca..cb04681 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/ms_docs_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py @@ -6,22 +6,26 @@ from typing import Any, Optional import pandas as pd + from unstructured.documents.elements import Element from 
unstructured.partition.docx import partition_docx from unstructured.partition.pptx import partition_pptx -from extractor_api_lib.document_parser.information_extractor import InformationExtractor + + from extractor_api_lib.file_services.file_service import FileService +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor from extractor_api_lib.impl.types.content_type import ContentType from extractor_api_lib.impl.types.file_type import FileType from extractor_api_lib.impl.utils.utils import hash_datetime -from extractor_api_lib.models.dataclasses.information_piece import InformationPiece +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from extractor_api_lib.table_converter.dataframe_converter import DataframeConverter logger = logging.getLogger(__name__) -class MSDocsExtractor(InformationExtractor): +class MSDocsExtractor(InformationFileExtractor): """Extractor for Microsoft Documents (DOCX and PPTX) using unstructured library.""" def __init__(self, file_service: FileService, dataframe_converter: DataframeConverter): @@ -50,7 +54,7 @@ def compatible_file_types(self) -> list[FileType]: """ return [FileType.DOCX, FileType.PPTX] - def extract_content(self, file_path: Path) -> list[InformationPiece]: + async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: """ Extract content from a given file based on its extension. 
@@ -92,8 +96,8 @@ def extract_content(self, file_path: Path) -> list[InformationPiece]: return self._process_elements(elements, file_path.name) - def _process_elements(self, elements: list[Element], document_name: str) -> list[InformationPiece]: - processed_elements: list[InformationPiece] = [] + def _process_elements(self, elements: list[Element], document_name: str) -> list[InternalInformationPiece]: + processed_elements: list[InternalInformationPiece] = [] page_content_lines: list[tuple[str, str]] = [] current_page: int = 1 old_page: int = 1 @@ -118,7 +122,7 @@ def _process_element( self, el: Element, page_content_lines: list[tuple[str, str]], - processed_elements: list[InformationPiece], + processed_elements: list[InternalInformationPiece], document_name: str, current_page: int, ) -> None: @@ -154,7 +158,7 @@ def _process_table(self, el: Element, page_content_lines: list[tuple[str, str]]) def _create_text_piece( self, document_name: str, page: int, page_content_lines: list[tuple[str, str]] - ) -> InformationPiece: + ) -> InternalInformationPiece: content = "\n".join([content for _, content in page_content_lines]) return self._create_information_piece(document_name, page, content, ContentType.TEXT) @@ -165,7 +169,7 @@ def _create_information_piece( content: str, content_type: ContentType, additional_meta: Optional[dict[str, Any]] = None, - ) -> InformationPiece: + ) -> InternalInformationPiece: metadata = { "document": document_name, "page": page, @@ -174,7 +178,7 @@ def _create_information_piece( } if additional_meta: metadata.update(additional_meta) - return InformationPiece( + return InternalInformationPiece( type=content_type, metadata=metadata, page_content=content, diff --git a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/pdf_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py similarity index 94% rename from extractor-api-lib/src/extractor_api_lib/impl/document_parser/pdf_extractor.py 
rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py index beaee14..01eb6bf 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/pdf_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py @@ -14,19 +14,21 @@ from pdf2image import convert_from_path from pdfplumber.page import Page -from extractor_api_lib.document_parser.information_extractor import InformationExtractor -from extractor_api_lib.file_services.file_service import FileService + from extractor_api_lib.impl.settings.pdf_extractor_settings import PDFExtractorSettings from extractor_api_lib.impl.types.content_type import ContentType from extractor_api_lib.impl.types.file_type import FileType from extractor_api_lib.impl.utils.utils import hash_datetime -from extractor_api_lib.models.dataclasses.information_piece import InformationPiece +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from extractor_api_lib.table_converter.dataframe_converter import DataframeConverter +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.file_services.file_service import FileService +from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor logger = logging.getLogger(__name__) -class PDFExtractor(InformationExtractor): +class PDFExtractor(InformationFileExtractor): """PDFExtractor is a class responsible for extracting information from PDF files. 
It converts PDF pages to images, identifies table/figure coordinates, and extracts @@ -86,7 +88,7 @@ def _create_information_piece( content_type: ContentType, information_id: str, additional_meta: Optional[dict] = None, - ) -> InformationPiece: + ) -> InternalInformationPiece: metadata = { "document": document_name, "page": page, @@ -96,13 +98,13 @@ def _create_information_piece( } if additional_meta: metadata = metadata | additional_meta - return InformationPiece( + return InternalInformationPiece( type=content_type, metadata=metadata, page_content=content, ) - def extract_content(self, file_path: Path) -> list[InformationPiece]: + async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: """Extract content from given file. Parameters @@ -147,7 +149,7 @@ def _extract_tabluar_data( document_name: str, text_x_tolerance: int = 1, text_y_tolerance: int = 1, - ) -> list[InformationPiece]: + ) -> list[InternalInformationPiece]: return_value = [] pdfplumber_tables = page.find_tables() table_strings = [] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/xml_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py similarity index 83% rename from extractor-api-lib/src/extractor_api_lib/impl/document_parser/xml_extractor.py rename to extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py index 3478cab..2a9d21c 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/document_parser/xml_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py @@ -5,20 +5,22 @@ from pathlib import Path from typing import Any, Optional + from unstructured.documents.elements import Element from unstructured.partition.xml import partition_xml -from extractor_api_lib.document_parser.information_extractor import InformationExtractor from extractor_api_lib.file_services.file_service import FileService +from 
extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor +from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.impl.types.content_type import ContentType from extractor_api_lib.impl.types.file_type import FileType from extractor_api_lib.impl.utils.utils import hash_datetime -from extractor_api_lib.models.dataclasses.information_piece import InformationPiece +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece logger = logging.getLogger(__name__) -class XMLExtractor(InformationExtractor): +class XMLExtractor(InformationFileExtractor): """Extractor for XML documents using unstructured library.""" def __init__(self, file_service: FileService): @@ -43,7 +45,7 @@ def compatible_file_types(self) -> list[FileType]: """ return [FileType.XML] - def extract_content(self, file_path: Path) -> list[InformationPiece]: + async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: """ Extract content from an XML file and processes the elements. 
@@ -60,8 +62,8 @@ def extract_content(self, file_path: Path) -> list[InformationPiece]: elements = partition_xml(filename=file_path.as_posix(), xml_keep_tags=False) return self._process_elements(elements, file_path.name) - def _process_elements(self, elements: list[Element], document_name: str) -> list[InformationPiece]: - processed_elements: list[InformationPiece] = [] + def _process_elements(self, elements: list[Element], document_name: str) -> list[InternalInformationPiece]: + processed_elements: list[InternalInformationPiece] = [] content_lines: list[tuple[str, str]] = [] for el in elements: @@ -86,7 +88,7 @@ def _sanitize_text(self, text: str) -> str: text = re.sub(r"\s+", " ", text) return text.strip() - def _create_text_piece(self, document_name: str, content_lines: list[tuple[str, str]]) -> InformationPiece: + def _create_text_piece(self, document_name: str, content_lines: list[tuple[str, str]]) -> InternalInformationPiece: content = "\n".join([content for _, content in content_lines]) return self._create_information_piece(document_name, content, ContentType.TEXT) @@ -96,7 +98,7 @@ def _create_information_piece( content: str, content_type: ContentType, additional_meta: Optional[dict[str, Any]] = None, - ) -> InformationPiece: + ) -> InternalInformationPiece: metadata = { "document": document_name, "id": hash_datetime(), @@ -104,7 +106,7 @@ def _create_information_piece( } if additional_meta: metadata.update(additional_meta) - return InformationPiece( + return InternalInformationPiece( type=content_type, metadata=metadata, page_content=content, diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py new file mode 100644 index 0000000..dfb7031 --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py @@ -0,0 +1,90 @@ +"""Module for the GeneralExtractor class.""" + +import logging +from pathlib 
import Path +import tempfile +import traceback +from typing import Any, List, Optional + + +from pydantic import StrictStr +from fastapi import UploadFile + +from extractor_api_lib.file_services.file_service import FileService +from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor +from extractor_api_lib.extractors.information_extractor import InformationExtractor +from extractor_api_lib.impl.types.extractor_types import ExtractorTypes +from extractor_api_lib.models.information_piece import InformationPiece +from extractor_api_lib.models.key_value_pair import KeyValuePair +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece + +logger = logging.getLogger(__name__) + + +class GeneralFileExtractor(InformationExtractor): + """A class to extract information from documents using available extractors. + + This class serves as a general extractor that utilizes a list of available + information extractors to extract content from documents. It determines the + appropriate extractor based on the file type of the document. + """ + + def __init__(self, file_service: FileService, available_extractors: list[InformationFileExtractor]): + """ + Initialize the GeneralExtractor. + + Parameters + ---------- + file_service : FileService + An instance of FileService to handle file operations. + available_extractors : list of InformationExtractor + A list of available information extractors to be used by the GeneralExtractor. + """ + self._file_service=file_service + self._available_extractors = available_extractors + + @property + def extractor_type(self) -> ExtractorTypes: + return ExtractorTypes.FILE + + async def aextract_content( + self, + type: StrictStr, + name: StrictStr, + file: Optional[UploadFile], + kwargs: Optional[List[KeyValuePair]], + ) -> list[InternalInformationPiece]: + """ + Extract content from given file. 
+ + Parameters + ---------- + file_path : Path + Path to the file the information should be extracted from. + + Returns + ------- + list[InformationPiece] + The extracted information. + """ + # save file on s3 + content = await file.read() + filename = file.filename + try: + with tempfile.TemporaryDirectory() as temp_dir: + temp_file_path = Path(temp_dir) / filename + with open(temp_file_path, "wb") as temp_file: + logger.debug("Temporary file created at %s.", temp_file_path) + temp_file.write(content) + logger.debug("Temp file created and content written.") + self._file_service.upload_file(temp_file_path, filename) + file_type = str(temp_file_path).split(".")[-1].upper() + correct_extractors = [ + x for x in self._available_extractors if file_type in [y.value for y in x.compatible_file_types] + ] + if not correct_extractors: + raise ValueError(f"No extractor found for file-ending {file_type}") + return await correct_extractors[-1].aextract_content(temp_file_path) + except Exception as e: + logger.error("Error during document parsing: %s %s", e, traceback.format_exc()) + raise e diff --git a/extractor-api-lib/src/extractor_api_lib/impl/file_services/__init__.py b/extractor-api-lib/src/extractor_api_lib/impl/file_services/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py index a4da430..11f57b4 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py @@ -2,8 +2,8 @@ from extractor_api_lib.impl.types.content_type import ContentType as InternalContentType from extractor_api_lib.models.content_type import ContentType as ExternalContentType -from extractor_api_lib.models.dataclasses.information_piece import ( - 
InformationPiece as InternalInformationPiece, +from extractor_api_lib.models.dataclasses.internal_information_piece import ( + InternalInformationPiece as InternalInformationPiece, ) from extractor_api_lib.models.information_piece import ( InformationPiece as ExternalInformationPiece, diff --git a/extractor-api-lib/src/extractor_api_lib/impl/types/extractor_types.py b/extractor-api-lib/src/extractor_api_lib/impl/types/extractor_types.py new file mode 100644 index 0000000..8a9a403 --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/impl/types/extractor_types.py @@ -0,0 +1,9 @@ +from enum import StrEnum + + +class ExtractorTypes(StrEnum): + """Enum describing the type of information source.""" + + FILE = "file" + CONFLUENCE = "confluence" + NONE = "None" diff --git a/extractor-api-lib/src/extractor_api_lib/models/dataclasses/information_piece.py b/extractor-api-lib/src/extractor_api_lib/models/dataclasses/internal_information_piece.py similarity index 92% rename from extractor-api-lib/src/extractor_api_lib/models/dataclasses/information_piece.py rename to extractor-api-lib/src/extractor_api_lib/models/dataclasses/internal_information_piece.py index 7bd609a..f0699e4 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/dataclasses/information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/models/dataclasses/internal_information_piece.py @@ -6,7 +6,7 @@ @dataclasses.dataclass -class InformationPiece: +class InternalInformationPiece: """Dataclass holding the information found in a document.""" type: ContentType # noqa: A003 # type of the information From b32d7c3e56302faa88115b96c511d678177b51b8 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Wed, 14 May 2025 15:39:00 +0200 Subject: [PATCH 05/56] it works --- admin-api-lib/poetry.lock | 2 +- admin-api-lib/pyproject.toml | 1 + .../api_endpoints/confluence_loader.py | 19 - .../api_endpoints/document_uploader.py | 26 - .../src/admin_api_lib/apis/admin_api.py | 15 +- 
.../src/admin_api_lib/apis/admin_api_base.py | 1 + .../src/admin_api_lib/dependency_container.py | 29 +- .../extractor_api_client.py | 50 ++ .../{openapi_client => }/models/__init__.py | 6 +- .../models/content_type.py | 0 .../models/information_piece.py | 4 +- .../models/key_value_pair.py | 0 .../openapi_client/__init__.py | 36 - .../openapi_client/api/__init__.py | 4 - .../openapi_client/api/extractor_api.py | 323 -------- .../openapi_client/api_client.py | 695 ------------------ .../openapi_client/api_response.py | 20 - .../openapi_client/configuration.py | 460 ------------ .../openapi_client/exceptions.py | 197 ----- .../models/confluence_parameters.py | 137 ---- .../models/extraction_request.py | 101 --- .../openapi_client/rest.py | 209 ------ .../openapi_client/test/__init__.py | 0 .../openapi_client/test/test_content_type.py | 35 - .../test/test_extraction_request.py | 58 -- .../openapi_client/test/test_extractor_api.py | 35 - .../test/test_information_piece.py | 62 -- .../test/test_key_value_pair.py | 54 -- .../admin_api_lib/file_services/__init__.py | 0 .../src/admin_api_lib/impl/admin_api.py | 5 +- .../default_confluence_loader.py | 195 ----- .../default_document_uploader.py | 192 ----- .../api_endpoints/default_source_uploader.py | 35 +- .../impl/mapper/confluence_settings_mapper.py | 36 - .../impl/mapper/informationpiece2document.py | 4 +- .../impl/settings/confluence_settings.py | 170 ----- extractor-api-lib/openapi.yaml | 3 +- .../extractor_api_lib/apis/extractor_api.py | 2 +- .../extractors/information_file_extractor.py | 6 +- .../file_extractors/ms_docs_extractor.py | 6 +- .../file_extractors/pdf_extractor.py | 8 +- .../file_extractors/xml_extractor.py | 4 +- .../impl/extractors/general_file_extractor.py | 4 +- rag-core-api/src/rag_core_api/apis/rag_api.py | 5 + 44 files changed, 125 insertions(+), 3129 deletions(-) delete mode 100644 admin-api-lib/src/admin_api_lib/api_endpoints/confluence_loader.py delete mode 100644 
admin-api-lib/src/admin_api_lib/api_endpoints/document_uploader.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py rename admin-api-lib/src/admin_api_lib/extractor_api_client/{openapi_client => }/models/__init__.py (54%) rename admin-api-lib/src/admin_api_lib/extractor_api_client/{openapi_client => }/models/content_type.py (100%) rename admin-api-lib/src/admin_api_lib/extractor_api_client/{openapi_client => }/models/information_piece.py (94%) rename admin-api-lib/src/admin_api_lib/extractor_api_client/{openapi_client => }/models/key_value_pair.py (100%) delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/confluence_parameters.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/__init__.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py delete 
mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py delete mode 100644 admin-api-lib/src/admin_api_lib/file_services/__init__.py delete mode 100644 admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py delete mode 100644 admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_uploader.py delete mode 100644 admin-api-lib/src/admin_api_lib/impl/mapper/confluence_settings_mapper.py delete mode 100644 admin-api-lib/src/admin_api_lib/impl/settings/confluence_settings.py diff --git a/admin-api-lib/poetry.lock b/admin-api-lib/poetry.lock index 671adcc..bd12f09 100644 --- a/admin-api-lib/poetry.lock +++ b/admin-api-lib/poetry.lock @@ -3693,4 +3693,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.13" -content-hash = "99eff6a6ab91512602e8e3094b71bdba096ccf58746d47afd92dff99b24da487" +content-hash = "f34effb5fa2b12b05da69ca28c62764dc2017a2a2a9336b5265428005004e7ec" diff --git a/admin-api-lib/pyproject.toml b/admin-api-lib/pyproject.toml index ec0de57..d7a995f 100644 --- a/admin-api-lib/pyproject.toml +++ b/admin-api-lib/pyproject.toml @@ -107,6 +107,7 @@ langfuse = "^2.60.4" redis = "^6.0.0" pyyaml = "^6.0.2" python-multipart = "^0.0.20" +requests-toolbelt = "^1.0.0" [tool.pytest.ini_options] log_cli = 1 diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/confluence_loader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/confluence_loader.py deleted file mode 100644 index 06d79be..0000000 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/confluence_loader.py +++ /dev/null @@ -1,19 +0,0 @@ -"""Module for ConfluenceLoader abstract base class.""" - -from abc import ABC, abstractmethod - - -class 
ConfluenceLoader(ABC): - """Abstract base class for the confluence loader endpoint.""" - - @abstractmethod - async def aload_from_confluence(self) -> None: - """ - Load data from Confluence asynchronously. - - This method should be implemented to load data asynchronously from Confluence. - - Returns - ------- - None - """ diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/document_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/document_uploader.py deleted file mode 100644 index 9a3e70b..0000000 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/document_uploader.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Module for the DocumentUploader abstract base class.""" - -from abc import ABC, abstractmethod - -from fastapi import Request, UploadFile - - -class DocumentUploader(ABC): - """Abstract base class for document upload endpoint.""" - - @abstractmethod - async def aupload_documents_post(self, body: UploadFile, request: Request) -> None: - """ - Upload documents asynchronously, currently supported formats are: PDF, DOCX, XML, PPTX. - - Parameters - ---------- - body : UploadFile - The uploaded file. - request : Request - The request object. 
- - Returns - ------- - None - """ diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index ccaed84..9d32286 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -1,11 +1,11 @@ # coding: utf-8 -from typing import Dict, List # noqa: F401 +from typing import Dict, List, Annotated # noqa: F401 import importlib import pkgutil from admin_api_lib.apis.admin_api_base import BaseAdminApi -from fastapi import APIRouter, Path, Request, Response, UploadFile # noqa: F401 +from fastapi import APIRouter, Path, Request, Response, UploadFile, Form # noqa: F401 import admin_api_lib.impl @@ -135,12 +135,13 @@ async def get_all_documents_status() -> List[DocumentStatus]: response_model_by_alias=True, ) async def upload_source( - type: StrictStr = Form(None, description=""), - name: StrictStr = Form(None, description=""), - file: Optional[UploadFile] = Form(None, description=""), - kwargs: Optional[List[KeyValuePair]] = Form(None, description=""), + request: Request, + type: Annotated[str, Form()], + name: Annotated[str, Form()], + file: Optional[UploadFile] = None, + kwargs: Optional[Annotated[List[KeyValuePair], Form()]] = None, ) -> None: """Uploads user selected sources.""" if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().upload_source(type, name, file, kwargs) + return await BaseAdminApi.subclasses[0]().upload_source(type, name, file, kwargs, request) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index 48e22dc..8aebb8b 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -70,6 +70,7 @@ async def upload_source( name: StrictStr, file: Optional[UploadFile], kwargs: Optional[List[KeyValuePair]], + request: 
Request, ) -> None: """Uploads user selected sources.""" ... diff --git a/admin-api-lib/src/admin_api_lib/dependency_container.py b/admin-api-lib/src/admin_api_lib/dependency_container.py index 4ca3b57..93b3ab2 100644 --- a/admin-api-lib/src/admin_api_lib/dependency_container.py +++ b/admin-api-lib/src/admin_api_lib/dependency_container.py @@ -1,6 +1,5 @@ """Module for the DependencyContainer class.""" -from admin_api_lib.impl.api_endpoints.default_source_uploader import DefaultSourceUploader from dependency_injector.containers import DeclarativeContainer from dependency_injector.providers import ( # noqa: WOT001 Configuration, @@ -12,25 +11,15 @@ from langchain_community.llms import Ollama, VLLMOpenAI from langfuse import Langfuse -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( - ExtractorApi, -) -from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient -from admin_api_lib.extractor_api_client.openapi_client.configuration import ( - Configuration as ExtractorConfiguration, -) -from admin_api_lib.impl.api_endpoints.default_confluence_loader import ( - DefaultConfluenceLoader, -) +from admin_api_lib.extractor_api_client.extractor_api_client import ExtractorApiClient +from admin_api_lib.impl.api_endpoints.default_source_uploader import DefaultSourceUploader from admin_api_lib.impl.api_endpoints.default_document_deleter import ( DefaultDocumentDeleter, ) from admin_api_lib.impl.api_endpoints.default_document_reference_retriever import ( DefaultDocumentReferenceRetriever, ) -from admin_api_lib.impl.api_endpoints.default_document_uploader import ( - DefaultDocumentUploader, -) + from admin_api_lib.impl.api_endpoints.default_documents_status_retriever import ( DefaultDocumentsStatusRetriever, ) @@ -43,14 +32,10 @@ from admin_api_lib.impl.key_db.file_status_key_value_store import ( FileStatusKeyValueStore, ) -from admin_api_lib.impl.mapper.confluence_settings_mapper import ( - ConfluenceSettingsMapper, -) from 
admin_api_lib.impl.mapper.informationpiece2document import ( InformationPiece2Document, ) from admin_api_lib.impl.settings.chunker_settings import ChunkerSettings -from admin_api_lib.impl.settings.confluence_settings import ConfluenceSettings from admin_api_lib.impl.settings.document_extractor_settings import ( DocumentExtractorSettings, ) @@ -93,7 +78,6 @@ class DependencyContainer(DeclarativeContainer): rag_api_settings = RAGAPISettings() key_value_store_settings = KeyValueSettings() summarizer_settings = SummarizerSettings() - confluence_settings = ConfluenceSettings() key_value_store = Singleton(FileStatusKeyValueStore, key_value_store_settings) file_service = Singleton(S3Service, s3_settings=s3_settings) @@ -103,16 +87,13 @@ class DependencyContainer(DeclarativeContainer): ) chunker = Singleton(TextChunker, text_splitter) - extractor_api_configuration = Singleton(ExtractorConfiguration, host=document_extractor_settings.host) - document_extractor_api_client = Singleton(ApiClient, extractor_api_configuration) - document_extractor = Singleton(ExtractorApi, document_extractor_api_client) + document_extractor = Singleton(ExtractorApiClient, document_extractor_settings.host) rag_api_configuration = Singleton(RagConfiguration, host=rag_api_settings.host) rag_api_client = Singleton(RagApiClient, configuration=rag_api_configuration) rag_api = Singleton(RagApi, rag_api_client) information_mapper = Singleton(InformationPiece2Document) - confluence_settings_mapper = Singleton(ConfluenceSettingsMapper) large_language_model = Selector( class_selector_config.llm_type, @@ -165,7 +146,7 @@ class DependencyContainer(DeclarativeContainer): DefaultDocumentDeleter, rag_api=rag_api, file_service=file_service, key_value_store=key_value_store ) documents_status_retriever = Singleton(DefaultDocumentsStatusRetriever, key_value_store=key_value_store) - + document_reference_retriever = Singleton(DefaultDocumentReferenceRetriever, file_service=file_service) source_uploader = Singleton( 
diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py new file mode 100644 index 0000000..78ccbf7 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py @@ -0,0 +1,50 @@ +import requests +from admin_api_lib.extractor_api_client.models.information_piece import InformationPiece +from requests_toolbelt.multipart import MultipartEncoder + + +class ExtractorApiClient: + def __init__(self, base_url): + """ + Initialize the client with the base URL of the API. + + Args: + base_url (str): The base URL of the API. + """ + self.base_url = base_url + + def extract(self, type, name, file, kwargs=None): + """ + Send an extraction request to the API. + + Args: + file (str): The path to the file to extract from. + name (str): The name of the extraction request. + type (str): The type of extraction to perform. + kwargs (list): A list of key-value pairs to pass as additional arguments. + + Returns: + list: A list of extracted information pieces. 
+ """ + with open(file, "rb") as openfile: + url = self.base_url + "/extract" + encoder = MultipartEncoder( + fields={ + "file": (file, openfile, "application/octet-stream"), + "name": name, + "type": type, + } + ) + if kwargs: + for pair in kwargs: + encoder.add_field(pair["key"], pair["value"]) + response = requests.post(url, headers={"Content-Type": encoder.content_type}, data=encoder) + if response.status_code == 200: + response_json = response.json() + return [InformationPiece.from_dict(x) for x in response_json] + elif response.status_code == 422: + raise ValueError("Invalid source") + elif response.status_code == 500: + raise Exception("Internal server error") + else: + raise Exception("Unknown error") diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/models/__init__.py similarity index 54% rename from admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py rename to admin-api-lib/src/admin_api_lib/extractor_api_client/models/__init__.py index e0ef19f..53560b6 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/models/__init__.py @@ -14,6 +14,6 @@ # import models into model package -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair +from admin_api_lib.extractor_api_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.models.information_piece import InformationPiece +from admin_api_lib.extractor_api_client.models.key_value_pair import KeyValuePair diff --git 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/models/content_type.py similarity index 100% rename from admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py rename to admin-api-lib/src/admin_api_lib/extractor_api_client/models/content_type.py diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/models/information_piece.py similarity index 94% rename from admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py rename to admin-api-lib/src/admin_api_lib/extractor_api_client/models/information_piece.py index a428183..99c3ee2 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/models/information_piece.py @@ -19,8 +19,8 @@ from pydantic import BaseModel, ConfigDict, StrictStr from typing import Any, ClassVar, Dict, List -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair +from admin_api_lib.extractor_api_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.models.key_value_pair import KeyValuePair from typing import Optional, Set from typing_extensions import Self diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/models/key_value_pair.py similarity index 100% rename from admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py rename to admin-api-lib/src/admin_api_lib/extractor_api_client/models/key_value_pair.py diff --git 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py deleted file mode 100644 index f43e4e9..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py +++ /dev/null @@ -1,36 +0,0 @@ -# coding: utf-8 - -# flake8: noqa - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. -""" # noqa: E501 - - -__version__ = "1.0.0" - -# import apis into sdk package -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi - -# import ApiClient -from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse -from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient -from admin_api_lib.extractor_api_client.openapi_client.configuration import Configuration -from admin_api_lib.extractor_api_client.openapi_client.exceptions import OpenApiException -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiTypeError -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiValueError -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiKeyError -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiAttributeError -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiException - -# import models into sdk package -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git 
a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py deleted file mode 100644 index c95ce65..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# flake8: noqa - -# import apis into api package -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py deleted file mode 100644 index 1a862d3..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py +++ /dev/null @@ -1,323 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - -import warnings -from pydantic import validate_call, Field, StrictFloat, StrictStr, StrictInt -from typing import Any, Dict, List, Optional, Tuple, Union -from typing_extensions import Annotated - -from pydantic import StrictBytes, StrictStr -from typing import List, Optional, Tuple, Union -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair - -from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient, RequestSerialized -from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse -from admin_api_lib.extractor_api_client.openapi_client.rest import RESTResponseType - - -class ExtractorApi: - """NOTE: This class is auto generated by OpenAPI Generator - Ref: https://openapi-generator.tech - - Do not edit the class manually. - """ - - def __init__(self, api_client=None) -> None: - if api_client is None: - api_client = ApiClient.get_default() - self.api_client = api_client - - @validate_call - def extract( - self, - type: StrictStr, - name: StrictStr, - file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None, - kwargs: Optional[List[KeyValuePair]] = None, - _request_timeout: Union[ - None, - Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], - ] = None, - _request_auth: Optional[Dict[StrictStr, Any]] = None, - _content_type: Optional[StrictStr] = None, - _headers: Optional[Dict[StrictStr, Any]] = None, - _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, - ) -> List[InformationPiece]: - """extract - - - :param type: (required) - :type type: str - :param name: (required) - :type name: str - :param file: - :type file: bytearray - :param kwargs: - :type kwargs: List[KeyValuePair] - :param _request_timeout: timeout setting for this request. 
If one - number provided, it will be total request - timeout. It can also be a pair (tuple) of - (connection, read) timeouts. - :type _request_timeout: int, tuple(int, int), optional - :param _request_auth: set to override the auth_settings for an a single - request; this effectively ignores the - authentication in the spec for a single request. - :type _request_auth: dict, optional - :param _content_type: force content-type for the request. - :type _content_type: str, Optional - :param _headers: set to override the headers for a single - request; this effectively ignores the headers - in the spec for a single request. - :type _headers: dict, optional - :param _host_index: set to override the host_index for a single - request; this effectively ignores the host_index - in the spec for a single request. - :type _host_index: int, optional - :return: Returns the result object. - """ # noqa: E501 - - _param = self._extract_serialize( - type=type, - name=name, - file=file, - kwargs=kwargs, - _request_auth=_request_auth, - _content_type=_content_type, - _headers=_headers, - _host_index=_host_index, - ) - - _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "422": None, - "500": None, - } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) - response_data.read() - return self.api_client.response_deserialize( - response_data=response_data, - response_types_map=_response_types_map, - ).data - - @validate_call - def extract_with_http_info( - self, - type: StrictStr, - name: StrictStr, - file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None, - kwargs: Optional[List[KeyValuePair]] = None, - _request_timeout: Union[ - None, - Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], - ] = None, - _request_auth: Optional[Dict[StrictStr, Any]] = None, - _content_type: Optional[StrictStr] = None, - _headers: 
Optional[Dict[StrictStr, Any]] = None, - _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, - ) -> ApiResponse[List[InformationPiece]]: - """extract - - - :param type: (required) - :type type: str - :param name: (required) - :type name: str - :param file: - :type file: bytearray - :param kwargs: - :type kwargs: List[KeyValuePair] - :param _request_timeout: timeout setting for this request. If one - number provided, it will be total request - timeout. It can also be a pair (tuple) of - (connection, read) timeouts. - :type _request_timeout: int, tuple(int, int), optional - :param _request_auth: set to override the auth_settings for an a single - request; this effectively ignores the - authentication in the spec for a single request. - :type _request_auth: dict, optional - :param _content_type: force content-type for the request. - :type _content_type: str, Optional - :param _headers: set to override the headers for a single - request; this effectively ignores the headers - in the spec for a single request. - :type _headers: dict, optional - :param _host_index: set to override the host_index for a single - request; this effectively ignores the host_index - in the spec for a single request. - :type _host_index: int, optional - :return: Returns the result object. 
- """ # noqa: E501 - - _param = self._extract_serialize( - type=type, - name=name, - file=file, - kwargs=kwargs, - _request_auth=_request_auth, - _content_type=_content_type, - _headers=_headers, - _host_index=_host_index, - ) - - _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "422": None, - "500": None, - } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) - response_data.read() - return self.api_client.response_deserialize( - response_data=response_data, - response_types_map=_response_types_map, - ) - - @validate_call - def extract_without_preload_content( - self, - type: StrictStr, - name: StrictStr, - file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None, - kwargs: Optional[List[KeyValuePair]] = None, - _request_timeout: Union[ - None, - Annotated[StrictFloat, Field(gt=0)], - Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], - ] = None, - _request_auth: Optional[Dict[StrictStr, Any]] = None, - _content_type: Optional[StrictStr] = None, - _headers: Optional[Dict[StrictStr, Any]] = None, - _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, - ) -> RESTResponseType: - """extract - - - :param type: (required) - :type type: str - :param name: (required) - :type name: str - :param file: - :type file: bytearray - :param kwargs: - :type kwargs: List[KeyValuePair] - :param _request_timeout: timeout setting for this request. If one - number provided, it will be total request - timeout. It can also be a pair (tuple) of - (connection, read) timeouts. - :type _request_timeout: int, tuple(int, int), optional - :param _request_auth: set to override the auth_settings for an a single - request; this effectively ignores the - authentication in the spec for a single request. - :type _request_auth: dict, optional - :param _content_type: force content-type for the request. 
- :type _content_type: str, Optional - :param _headers: set to override the headers for a single - request; this effectively ignores the headers - in the spec for a single request. - :type _headers: dict, optional - :param _host_index: set to override the host_index for a single - request; this effectively ignores the host_index - in the spec for a single request. - :type _host_index: int, optional - :return: Returns the result object. - """ # noqa: E501 - - _param = self._extract_serialize( - type=type, - name=name, - file=file, - kwargs=kwargs, - _request_auth=_request_auth, - _content_type=_content_type, - _headers=_headers, - _host_index=_host_index, - ) - - _response_types_map: Dict[str, Optional[str]] = { - "200": "List[InformationPiece]", - "422": None, - "500": None, - } - response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) - return response_data.response - - def _extract_serialize( - self, - type, - name, - file, - kwargs, - _request_auth, - _content_type, - _headers, - _host_index, - ) -> RequestSerialized: - - _host = None - - _collection_formats: Dict[str, str] = { - "kwargs": "csv", - } - - _path_params: Dict[str, str] = {} - _query_params: List[Tuple[str, str]] = [] - _header_params: Dict[str, Optional[str]] = _headers or {} - _form_params: List[Tuple[str, str]] = [] - _files: Dict[str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]] = {} - _body_params: Optional[bytes] = None - - # process the path parameters - # process the query parameters - # process the header parameters - # process the form parameters - if file is not None: - _files["file"] = file - if type is not None: - _form_params.append(("type", type)) - if kwargs is not None: - _form_params.append(("kwargs", kwargs)) - if name is not None: - _form_params.append(("name", name)) - # process the body parameter - - # set the HTTP header `Accept` - if "Accept" not in _header_params: - _header_params["Accept"] = 
self.api_client.select_header_accept(["application/json"]) - - # set the HTTP header `Content-Type` - if _content_type: - _header_params["Content-Type"] = _content_type - else: - _default_content_type = self.api_client.select_header_content_type(["multipart/form-data"]) - if _default_content_type is not None: - _header_params["Content-Type"] = _default_content_type - - # authentication setting - _auth_settings: List[str] = [] - - return self.api_client.param_serialize( - method="POST", - resource_path="/extract", - path_params=_path_params, - query_params=_query_params, - header_params=_header_params, - body=_body_params, - post_params=_form_params, - files=_files, - auth_settings=_auth_settings, - collection_formats=_collection_formats, - _host=_host, - _request_auth=_request_auth, - ) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py deleted file mode 100644 index ba8f5d2..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py +++ /dev/null @@ -1,695 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import datetime -from dateutil.parser import parse -from enum import Enum -import decimal -import json -import mimetypes -import os -import re -import tempfile - -from urllib.parse import quote -from typing import Tuple, Optional, List, Dict, Union -from pydantic import SecretStr - -from admin_api_lib.extractor_api_client.openapi_client.configuration import Configuration -from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse, T as ApiResponseT -import admin_api_lib.extractor_api_client.openapi_client.models -from admin_api_lib.extractor_api_client.openapi_client import rest -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ( - ApiValueError, - ApiException, - BadRequestException, - UnauthorizedException, - ForbiddenException, - NotFoundException, - ServiceException, -) - -RequestSerialized = Tuple[str, str, Dict[str, str], Optional[str], List[str]] - - -class ApiClient: - """Generic API client for OpenAPI client library builds. - - OpenAPI generic API client. This client handles the client- - server communication, and is invariant across implementations. Specifics of - the methods and models for each application are generated from the OpenAPI - templates. - - :param configuration: .Configuration object for this client - :param header_name: a header to pass when making calls to the API. - :param header_value: a header value to pass when making calls to - the API. - :param cookie: a cookie to include in the header when making calls - to the API - """ - - PRIMITIVE_TYPES = (float, bool, bytes, str, int) - NATIVE_TYPES_MAPPING = { - "int": int, - "long": int, # TODO remove as only py3 is supported? 
- "float": float, - "str": str, - "bool": bool, - "date": datetime.date, - "datetime": datetime.datetime, - "decimal": decimal.Decimal, - "object": object, - } - _pool = None - - def __init__(self, configuration=None, header_name=None, header_value=None, cookie=None) -> None: - # use default configuration if none is provided - if configuration is None: - configuration = Configuration.get_default() - self.configuration = configuration - - self.rest_client = rest.RESTClientObject(configuration) - self.default_headers = {} - if header_name is not None: - self.default_headers[header_name] = header_value - self.cookie = cookie - # Set default User-Agent. - self.user_agent = "OpenAPI-Generator/1.0.0/python" - self.client_side_validation = configuration.client_side_validation - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - pass - - @property - def user_agent(self): - """User agent for this API client""" - return self.default_headers["User-Agent"] - - @user_agent.setter - def user_agent(self, value): - self.default_headers["User-Agent"] = value - - def set_default_header(self, header_name, header_value): - self.default_headers[header_name] = header_value - - _default = None - - @classmethod - def get_default(cls): - """Return new instance of ApiClient. - - This method returns newly created, based on default constructor, - object of ApiClient class or returns a copy of default - ApiClient. - - :return: The ApiClient object. - """ - if cls._default is None: - cls._default = ApiClient() - return cls._default - - @classmethod - def set_default(cls, default): - """Set default instance of ApiClient. - - It stores default ApiClient. - - :param default: object of ApiClient. 
- """ - cls._default = default - - def param_serialize( - self, - method, - resource_path, - path_params=None, - query_params=None, - header_params=None, - body=None, - post_params=None, - files=None, - auth_settings=None, - collection_formats=None, - _host=None, - _request_auth=None, - ) -> RequestSerialized: - """Builds the HTTP request params needed by the request. - :param method: Method to call. - :param resource_path: Path to method endpoint. - :param path_params: Path parameters in the url. - :param query_params: Query parameters in the url. - :param header_params: Header parameters to be - placed in the request header. - :param body: Request body. - :param post_params dict: Request post form parameters, - for `application/x-www-form-urlencoded`, `multipart/form-data`. - :param auth_settings list: Auth Settings names for the request. - :param files dict: key -> filename, value -> filepath, - for `multipart/form-data`. - :param collection_formats: dict of collection formats for path, query, - header, and post parameters. - :param _request_auth: set to override the auth_settings for an a single - request; this effectively ignores the authentication - in the spec for a single request. 
- :return: tuple of form (path, http_method, query_params, header_params, - body, post_params, files) - """ - - config = self.configuration - - # header parameters - header_params = header_params or {} - header_params.update(self.default_headers) - if self.cookie: - header_params["Cookie"] = self.cookie - if header_params: - header_params = self.sanitize_for_serialization(header_params) - header_params = dict(self.parameters_to_tuples(header_params, collection_formats)) - - # path parameters - if path_params: - path_params = self.sanitize_for_serialization(path_params) - path_params = self.parameters_to_tuples(path_params, collection_formats) - for k, v in path_params: - # specified safe chars, encode everything - resource_path = resource_path.replace("{%s}" % k, quote(str(v), safe=config.safe_chars_for_path_param)) - - # post parameters - if post_params or files: - post_params = post_params if post_params else [] - post_params = self.sanitize_for_serialization(post_params) - post_params = self.parameters_to_tuples(post_params, collection_formats) - if files: - post_params.extend(self.files_parameters(files)) - - # auth setting - self.update_params_for_auth( - header_params, query_params, auth_settings, resource_path, method, body, request_auth=_request_auth - ) - - # body - if body: - body = self.sanitize_for_serialization(body) - - # request url - if _host is None or self.configuration.ignore_operation_servers: - url = self.configuration.host + resource_path - else: - # use server/host defined in path or operation instead - url = _host + resource_path - - # query parameters - if query_params: - query_params = self.sanitize_for_serialization(query_params) - url_query = self.parameters_to_url_query(query_params, collection_formats) - url += "?" 
+ url_query - - return method, url, header_params, body, post_params - - def call_api( - self, method, url, header_params=None, body=None, post_params=None, _request_timeout=None - ) -> rest.RESTResponse: - """Makes the HTTP request (synchronous) - :param method: Method to call. - :param url: Path to method endpoint. - :param header_params: Header parameters to be - placed in the request header. - :param body: Request body. - :param post_params dict: Request post form parameters, - for `application/x-www-form-urlencoded`, `multipart/form-data`. - :param _request_timeout: timeout setting for this request. - :return: RESTResponse - """ - - try: - # perform request and return response - response_data = self.rest_client.request( - method, - url, - headers=header_params, - body=body, - post_params=post_params, - _request_timeout=_request_timeout, - ) - - except ApiException as e: - raise e - - return response_data - - def response_deserialize( - self, response_data: rest.RESTResponse, response_types_map: Optional[Dict[str, ApiResponseT]] = None - ) -> ApiResponse[ApiResponseT]: - """Deserializes response into an object. - :param response_data: RESTResponse object to be deserialized. - :param response_types_map: dict of response types. - :return: ApiResponse - """ - - msg = "RESTResponse.read() must be called before passing it to response_deserialize()" - assert response_data.data is not None, msg - - response_type = response_types_map.get(str(response_data.status), None) - if not response_type and isinstance(response_data.status, int) and 100 <= response_data.status <= 599: - # if not found, look for '1XX', '2XX', etc. 
- response_type = response_types_map.get(str(response_data.status)[0] + "XX", None) - - # deserialize response data - response_text = None - return_data = None - try: - if response_type == "bytearray": - return_data = response_data.data - elif response_type == "file": - return_data = self.__deserialize_file(response_data) - elif response_type is not None: - match = None - content_type = response_data.getheader("content-type") - if content_type is not None: - match = re.search(r"charset=([a-zA-Z\-\d]+)[\s;]?", content_type) - encoding = match.group(1) if match else "utf-8" - response_text = response_data.data.decode(encoding) - return_data = self.deserialize(response_text, response_type, content_type) - finally: - if not 200 <= response_data.status <= 299: - raise ApiException.from_response( - http_resp=response_data, - body=response_text, - data=return_data, - ) - - return ApiResponse( - status_code=response_data.status, - data=return_data, - headers=response_data.getheaders(), - raw_data=response_data.data, - ) - - def sanitize_for_serialization(self, obj): - """Builds a JSON POST object. - - If obj is None, return None. - If obj is SecretStr, return obj.get_secret_value() - If obj is str, int, long, float, bool, return directly. - If obj is datetime.datetime, datetime.date - convert to string in iso8601 format. - If obj is decimal.Decimal return string representation. - If obj is list, sanitize each element in the list. - If obj is dict, return the dict. - If obj is OpenAPI model, return the properties dict. - - :param obj: The data to serialize. - :return: The serialized form of data. 
- """ - if obj is None: - return None - elif isinstance(obj, Enum): - return obj.value - elif isinstance(obj, SecretStr): - return obj.get_secret_value() - elif isinstance(obj, self.PRIMITIVE_TYPES): - return obj - elif isinstance(obj, list): - return [self.sanitize_for_serialization(sub_obj) for sub_obj in obj] - elif isinstance(obj, tuple): - return tuple(self.sanitize_for_serialization(sub_obj) for sub_obj in obj) - elif isinstance(obj, (datetime.datetime, datetime.date)): - return obj.isoformat() - elif isinstance(obj, decimal.Decimal): - return str(obj) - - elif isinstance(obj, dict): - obj_dict = obj - else: - # Convert model obj to dict except - # attributes `openapi_types`, `attribute_map` - # and attributes which value is not None. - # Convert attribute name to json key in - # model definition for request. - if hasattr(obj, "to_dict") and callable(getattr(obj, "to_dict")): - obj_dict = obj.to_dict() - else: - obj_dict = obj.__dict__ - - return {key: self.sanitize_for_serialization(val) for key, val in obj_dict.items()} - - def deserialize(self, response_text: str, response_type: str, content_type: Optional[str]): - """Deserializes response into an object. - - :param response: RESTResponse object to be deserialized. - :param response_type: class literal for - deserialized object, or string of class name. - :param content_type: content type of response. - - :return: deserialized object. 
- """ - - # fetch data from response object - if content_type is None: - try: - data = json.loads(response_text) - except ValueError: - data = response_text - elif re.match(r"^application/(json|[\w!#$&.+-^_]+\+json)\s*(;|$)", content_type, re.IGNORECASE): - if response_text == "": - data = "" - else: - data = json.loads(response_text) - elif re.match(r"^text\/[a-z.+-]+\s*(;|$)", content_type, re.IGNORECASE): - data = response_text - else: - raise ApiException(status=0, reason="Unsupported content type: {0}".format(content_type)) - - return self.__deserialize(data, response_type) - - def __deserialize(self, data, klass): - """Deserializes dict, list, str into an object. - - :param data: dict, list or str. - :param klass: class literal, or string of class name. - - :return: object. - """ - if data is None: - return None - - if isinstance(klass, str): - if klass.startswith("List["): - m = re.match(r"List\[(.*)]", klass) - assert m is not None, "Malformed List type definition" - sub_kls = m.group(1) - return [self.__deserialize(sub_data, sub_kls) for sub_data in data] - - if klass.startswith("Dict["): - m = re.match(r"Dict\[([^,]*), (.*)]", klass) - assert m is not None, "Malformed Dict type definition" - sub_kls = m.group(2) - return {k: self.__deserialize(v, sub_kls) for k, v in data.items()} - - # convert str to class - if klass in self.NATIVE_TYPES_MAPPING: - klass = self.NATIVE_TYPES_MAPPING[klass] - else: - klass = getattr(admin_api_lib.extractor_api_client.openapi_client.models, klass) - - if klass in self.PRIMITIVE_TYPES: - return self.__deserialize_primitive(data, klass) - elif klass == object: - return self.__deserialize_object(data) - elif klass == datetime.date: - return self.__deserialize_date(data) - elif klass == datetime.datetime: - return self.__deserialize_datetime(data) - elif klass == decimal.Decimal: - return decimal.Decimal(data) - elif issubclass(klass, Enum): - return self.__deserialize_enum(data, klass) - else: - return 
self.__deserialize_model(data, klass) - - def parameters_to_tuples(self, params, collection_formats): - """Get parameters as list of tuples, formatting collections. - - :param params: Parameters as dict or list of two-tuples - :param dict collection_formats: Parameter collection formats - :return: Parameters as list of tuples, collections formatted - """ - new_params: List[Tuple[str, str]] = [] - if collection_formats is None: - collection_formats = {} - for k, v in params.items() if isinstance(params, dict) else params: - if k in collection_formats: - collection_format = collection_formats[k] - if collection_format == "multi": - new_params.extend((k, value) for value in v) - else: - if collection_format == "ssv": - delimiter = " " - elif collection_format == "tsv": - delimiter = "\t" - elif collection_format == "pipes": - delimiter = "|" - else: # csv is the default - delimiter = "," - new_params.append((k, delimiter.join(str(value) for value in v))) - else: - new_params.append((k, v)) - return new_params - - def parameters_to_url_query(self, params, collection_formats): - """Get parameters as list of tuples, formatting collections. - - :param params: Parameters as dict or list of two-tuples - :param dict collection_formats: Parameter collection formats - :return: URL query string (e.g. 
a=Hello%20World&b=123) - """ - new_params: List[Tuple[str, str]] = [] - if collection_formats is None: - collection_formats = {} - for k, v in params.items() if isinstance(params, dict) else params: - if isinstance(v, bool): - v = str(v).lower() - if isinstance(v, (int, float)): - v = str(v) - if isinstance(v, dict): - v = json.dumps(v) - - if k in collection_formats: - collection_format = collection_formats[k] - if collection_format == "multi": - new_params.extend((k, str(value)) for value in v) - else: - if collection_format == "ssv": - delimiter = " " - elif collection_format == "tsv": - delimiter = "\t" - elif collection_format == "pipes": - delimiter = "|" - else: # csv is the default - delimiter = "," - new_params.append((k, delimiter.join(quote(str(value)) for value in v))) - else: - new_params.append((k, quote(str(v)))) - - return "&".join(["=".join(map(str, item)) for item in new_params]) - - def files_parameters( - self, - files: Dict[str, Union[str, bytes, List[str], List[bytes], Tuple[str, bytes]]], - ): - """Builds form parameters. - - :param files: File parameters. - :return: Form parameters with files. - """ - params = [] - for k, v in files.items(): - if isinstance(v, str): - with open(v, "rb") as f: - filename = os.path.basename(f.name) - filedata = f.read() - elif isinstance(v, bytes): - filename = k - filedata = v - elif isinstance(v, tuple): - filename, filedata = v - elif isinstance(v, list): - for file_param in v: - params.extend(self.files_parameters({k: file_param})) - continue - else: - raise ValueError("Unsupported file value") - mimetype = mimetypes.guess_type(filename)[0] or "application/octet-stream" - params.append(tuple([k, tuple([filename, filedata, mimetype])])) - return params - - def select_header_accept(self, accepts: List[str]) -> Optional[str]: - """Returns `Accept` based on an array of accepts provided. - - :param accepts: List of headers. - :return: Accept (e.g. application/json). 
- """ - if not accepts: - return None - - for accept in accepts: - if re.search("json", accept, re.IGNORECASE): - return accept - - return accepts[0] - - def select_header_content_type(self, content_types): - """Returns `Content-Type` based on an array of content_types provided. - - :param content_types: List of content-types. - :return: Content-Type (e.g. application/json). - """ - if not content_types: - return None - - for content_type in content_types: - if re.search("json", content_type, re.IGNORECASE): - return content_type - - return content_types[0] - - def update_params_for_auth( - self, headers, queries, auth_settings, resource_path, method, body, request_auth=None - ) -> None: - """Updates header and query params based on authentication setting. - - :param headers: Header parameters dict to be updated. - :param queries: Query parameters tuple list to be updated. - :param auth_settings: Authentication setting identifiers list. - :resource_path: A string representation of the HTTP request resource path. - :method: A string representation of the HTTP request method. - :body: A object representing the body of the HTTP request. - The object type is the return value of sanitize_for_serialization(). - :param request_auth: if set, the provided settings will - override the token in the configuration. - """ - if not auth_settings: - return - - if request_auth: - self._apply_auth_params(headers, queries, resource_path, method, body, request_auth) - else: - for auth in auth_settings: - auth_setting = self.configuration.auth_settings().get(auth) - if auth_setting: - self._apply_auth_params(headers, queries, resource_path, method, body, auth_setting) - - def _apply_auth_params(self, headers, queries, resource_path, method, body, auth_setting) -> None: - """Updates the request parameters based on a single auth_setting - - :param headers: Header parameters dict to be updated. - :param queries: Query parameters tuple list to be updated. 
- :resource_path: A string representation of the HTTP request resource path. - :method: A string representation of the HTTP request method. - :body: A object representing the body of the HTTP request. - The object type is the return value of sanitize_for_serialization(). - :param auth_setting: auth settings for the endpoint - """ - if auth_setting["in"] == "cookie": - headers["Cookie"] = auth_setting["value"] - elif auth_setting["in"] == "header": - if auth_setting["type"] != "http-signature": - headers[auth_setting["key"]] = auth_setting["value"] - elif auth_setting["in"] == "query": - queries.append((auth_setting["key"], auth_setting["value"])) - else: - raise ApiValueError("Authentication token must be in `query` or `header`") - - def __deserialize_file(self, response): - """Deserializes body to file - - Saves response body into a file in a temporary folder, - using the filename from the `Content-Disposition` header if provided. - - handle file downloading - save response body into a tmp file and return the instance - - :param response: RESTResponse. - :return: file path. - """ - fd, path = tempfile.mkstemp(dir=self.configuration.temp_folder_path) - os.close(fd) - os.remove(path) - - content_disposition = response.getheader("Content-Disposition") - if content_disposition: - m = re.search(r'filename=[\'"]?([^\'"\s]+)[\'"]?', content_disposition) - assert m is not None, "Unexpected 'content-disposition' header value" - filename = m.group(1) - path = os.path.join(os.path.dirname(path), filename) - - with open(path, "wb") as f: - f.write(response.data) - - return path - - def __deserialize_primitive(self, data, klass): - """Deserializes string to primitive type. - - :param data: str. - :param klass: class literal. - - :return: int, long, float, str, bool. - """ - try: - return klass(data) - except UnicodeEncodeError: - return str(data) - except TypeError: - return data - - def __deserialize_object(self, value): - """Return an original value. - - :return: object. 
- """ - return value - - def __deserialize_date(self, string): - """Deserializes string to date. - - :param string: str. - :return: date. - """ - try: - return parse(string).date() - except ImportError: - return string - except ValueError: - raise rest.ApiException(status=0, reason="Failed to parse `{0}` as date object".format(string)) - - def __deserialize_datetime(self, string): - """Deserializes string to datetime. - - The string should be in iso8601 datetime format. - - :param string: str. - :return: datetime. - """ - try: - return parse(string) - except ImportError: - return string - except ValueError: - raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as datetime object".format(string))) - - def __deserialize_enum(self, data, klass): - """Deserializes primitive type to enum. - - :param data: primitive type. - :param klass: class literal. - :return: enum value. - """ - try: - return klass(data) - except ValueError: - raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as `{1}`".format(data, klass))) - - def __deserialize_model(self, data, klass): - """Deserializes list or dict to model. - - :param data: dict, list. - :param klass: class literal. - :return: model object. 
- """ - - return klass.from_dict(data) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py deleted file mode 100644 index 1ce1372..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py +++ /dev/null @@ -1,20 +0,0 @@ -"""API response object.""" - -from __future__ import annotations -from typing import Optional, Generic, Mapping, TypeVar -from pydantic import Field, StrictInt, StrictBytes, BaseModel - -T = TypeVar("T") - - -class ApiResponse(BaseModel, Generic[T]): - """ - API response object - """ - - status_code: StrictInt = Field(description="HTTP status code") - headers: Optional[Mapping[str, str]] = Field(None, description="HTTP headers") - data: T = Field(description="Deserialized data given the data type") - raw_data: StrictBytes = Field(description="Raw data (HTTP response body)") - - model_config = {"arbitrary_types_allowed": True} diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py deleted file mode 100644 index 2e80369..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py +++ /dev/null @@ -1,460 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import copy -import logging -from logging import FileHandler -import multiprocessing -import sys -from typing import Optional -import urllib3 - -import http.client as httplib - -JSON_SCHEMA_VALIDATION_KEYWORDS = { - "multipleOf", - "maximum", - "exclusiveMaximum", - "minimum", - "exclusiveMinimum", - "maxLength", - "minLength", - "pattern", - "maxItems", - "minItems", -} - - -class Configuration: - """This class contains various settings of the API client. - - :param host: Base url. - :param ignore_operation_servers - Boolean to ignore operation servers for the API client. - Config will use `host` as the base url regardless of the operation servers. - :param api_key: Dict to store API key(s). - Each entry in the dict specifies an API key. - The dict key is the name of the security scheme in the OAS specification. - The dict value is the API key secret. - :param api_key_prefix: Dict to store API prefix (e.g. Bearer). - The dict key is the name of the security scheme in the OAS specification. - The dict value is an API key prefix when generating the auth data. - :param username: Username for HTTP basic authentication. - :param password: Password for HTTP basic authentication. - :param access_token: Access token. - :param server_index: Index to servers configuration. - :param server_variables: Mapping with string values to replace variables in - templated server configuration. The validation of enums is performed for - variables with defined enum values before. - :param server_operation_index: Mapping from operation ID to an index to server - configuration. - :param server_operation_variables: Mapping from operation ID to a mapping with - string values to replace variables in templated server configuration. - The validation of enums is performed for variables with defined enum - values before. - :param ssl_ca_cert: str - the path to a file of concatenated CA certificates - in PEM format. - :param retries: Number of retries for API requests. 
- - """ - - _default = None - - def __init__( - self, - host=None, - api_key=None, - api_key_prefix=None, - username=None, - password=None, - access_token=None, - server_index=None, - server_variables=None, - server_operation_index=None, - server_operation_variables=None, - ignore_operation_servers=False, - ssl_ca_cert=None, - retries=None, - *, - debug: Optional[bool] = None - ) -> None: - """Constructor""" - self._base_path = "http://localhost" if host is None else host - """Default Base url - """ - self.server_index = 0 if server_index is None and host is None else server_index - self.server_operation_index = server_operation_index or {} - """Default server index - """ - self.server_variables = server_variables or {} - self.server_operation_variables = server_operation_variables or {} - """Default server variables - """ - self.ignore_operation_servers = ignore_operation_servers - """Ignore operation servers - """ - self.temp_folder_path = None - """Temp file folder for downloading files - """ - # Authentication Settings - self.api_key = {} - if api_key: - self.api_key = api_key - """dict to store API key(s) - """ - self.api_key_prefix = {} - if api_key_prefix: - self.api_key_prefix = api_key_prefix - """dict to store API prefix (e.g. 
Bearer) - """ - self.refresh_api_key_hook = None - """function hook to refresh API key if expired - """ - self.username = username - """Username for HTTP basic authentication - """ - self.password = password - """Password for HTTP basic authentication - """ - self.access_token = access_token - """Access token - """ - self.logger = {} - """Logging Settings - """ - self.logger["package_logger"] = logging.getLogger("admin_api_lib.extractor_api_client.openapi_client") - self.logger["urllib3_logger"] = logging.getLogger("urllib3") - self.logger_format = "%(asctime)s %(levelname)s %(message)s" - """Log format - """ - self.logger_stream_handler = None - """Log stream handler - """ - self.logger_file_handler: Optional[FileHandler] = None - """Log file handler - """ - self.logger_file = None - """Debug file location - """ - if debug is not None: - self.debug = debug - else: - self.__debug = False - """Debug switch - """ - - self.verify_ssl = True - """SSL/TLS verification - Set this to false to skip verifying SSL certificate when calling API - from https server. - """ - self.ssl_ca_cert = ssl_ca_cert - """Set this to customize the certificate file to verify the peer. - """ - self.cert_file = None - """client certificate file - """ - self.key_file = None - """client key file - """ - self.assert_hostname = None - """Set this to True/False to enable/disable SSL hostname verification. - """ - self.tls_server_name = None - """SSL/TLS Server Name Indication (SNI) - Set this to the SNI value expected by the server. - """ - - self.connection_pool_maxsize = multiprocessing.cpu_count() * 5 - """urllib3 connection pool's maximum number of connections saved - per pool. urllib3 uses 1 connection as default value, but this is - not the best value when you are making a lot of possibly parallel - requests to the same host, which is often the case here. - cpu_count * 5 is used as default value to increase performance. 
- """ - - self.proxy: Optional[str] = None - """Proxy URL - """ - self.proxy_headers = None - """Proxy headers - """ - self.safe_chars_for_path_param = "" - """Safe chars for path_param - """ - self.retries = retries - """Adding retries to override urllib3 default value 3 - """ - # Enable client side validation - self.client_side_validation = True - - self.socket_options = None - """Options to pass down to the underlying urllib3 socket - """ - - self.datetime_format = "%Y-%m-%dT%H:%M:%S.%f%z" - """datetime format - """ - - self.date_format = "%Y-%m-%d" - """date format - """ - - def __deepcopy__(self, memo): - cls = self.__class__ - result = cls.__new__(cls) - memo[id(self)] = result - for k, v in self.__dict__.items(): - if k not in ("logger", "logger_file_handler"): - setattr(result, k, copy.deepcopy(v, memo)) - # shallow copy of loggers - result.logger = copy.copy(self.logger) - # use setters to configure loggers - result.logger_file = self.logger_file - result.debug = self.debug - return result - - def __setattr__(self, name, value): - object.__setattr__(self, name, value) - - @classmethod - def set_default(cls, default): - """Set default instance of configuration. - - It stores default configuration, which can be - returned by get_default_copy method. - - :param default: object of Configuration - """ - cls._default = default - - @classmethod - def get_default_copy(cls): - """Deprecated. Please use `get_default` instead. - - Deprecated. Please use `get_default` instead. - - :return: The configuration object. - """ - return cls.get_default() - - @classmethod - def get_default(cls): - """Return the default configuration. - - This method returns newly created, based on default constructor, - object of Configuration class or returns a copy of default - configuration. - - :return: The configuration object. - """ - if cls._default is None: - cls._default = Configuration() - return cls._default - - @property - def logger_file(self): - """The logger file. 
- - If the logger_file is None, then add stream handler and remove file - handler. Otherwise, add file handler and remove stream handler. - - :param value: The logger_file path. - :type: str - """ - return self.__logger_file - - @logger_file.setter - def logger_file(self, value): - """The logger file. - - If the logger_file is None, then add stream handler and remove file - handler. Otherwise, add file handler and remove stream handler. - - :param value: The logger_file path. - :type: str - """ - self.__logger_file = value - if self.__logger_file: - # If set logging file, - # then add file handler and remove stream handler. - self.logger_file_handler = logging.FileHandler(self.__logger_file) - self.logger_file_handler.setFormatter(self.logger_formatter) - for _, logger in self.logger.items(): - logger.addHandler(self.logger_file_handler) - - @property - def debug(self): - """Debug status - - :param value: The debug status, True or False. - :type: bool - """ - return self.__debug - - @debug.setter - def debug(self, value): - """Debug status - - :param value: The debug status, True or False. - :type: bool - """ - self.__debug = value - if self.__debug: - # if debug status is True, turn on debug logging - for _, logger in self.logger.items(): - logger.setLevel(logging.DEBUG) - # turn on httplib debug - httplib.HTTPConnection.debuglevel = 1 - else: - # if debug status is False, turn off debug logging, - # setting log level to default `logging.WARNING` - for _, logger in self.logger.items(): - logger.setLevel(logging.WARNING) - # turn off httplib debug - httplib.HTTPConnection.debuglevel = 0 - - @property - def logger_format(self): - """The logger format. - - The logger_formatter will be updated when sets logger_format. - - :param value: The format string. - :type: str - """ - return self.__logger_format - - @logger_format.setter - def logger_format(self, value): - """The logger format. - - The logger_formatter will be updated when sets logger_format. 
- - :param value: The format string. - :type: str - """ - self.__logger_format = value - self.logger_formatter = logging.Formatter(self.__logger_format) - - def get_api_key_with_prefix(self, identifier, alias=None): - """Gets API key (with prefix if set). - - :param identifier: The identifier of apiKey. - :param alias: The alternative identifier of apiKey. - :return: The token for api key authentication. - """ - if self.refresh_api_key_hook is not None: - self.refresh_api_key_hook(self) - key = self.api_key.get(identifier, self.api_key.get(alias) if alias is not None else None) - if key: - prefix = self.api_key_prefix.get(identifier) - if prefix: - return "%s %s" % (prefix, key) - else: - return key - - def get_basic_auth_token(self): - """Gets HTTP basic authentication header (string). - - :return: The token for basic HTTP authentication. - """ - username = "" - if self.username is not None: - username = self.username - password = "" - if self.password is not None: - password = self.password - return urllib3.util.make_headers(basic_auth=username + ":" + password).get("authorization") - - def auth_settings(self): - """Gets Auth Settings dict for api client. - - :return: The Auth Settings information dict. - """ - auth = {} - return auth - - def to_debug_report(self): - """Gets the essential information for debugging. - - :return: The report for debugging. 
- """ - return ( - "Python SDK Debug Report:\n" - "OS: {env}\n" - "Python Version: {pyversion}\n" - "Version of the API: 1.0.0\n" - "SDK Package Version: 1.0.0".format(env=sys.platform, pyversion=sys.version) - ) - - def get_host_settings(self): - """Gets an array of host settings - - :return: An array of host settings - """ - return [ - { - "url": "", - "description": "No description provided", - } - ] - - def get_host_from_settings(self, index, variables=None, servers=None): - """Gets host URL based on the index and variables - :param index: array index of the host settings - :param variables: hash of variable and the corresponding value - :param servers: an array of host settings or None - :return: URL based on host settings - """ - if index is None: - return self._base_path - - variables = {} if variables is None else variables - servers = self.get_host_settings() if servers is None else servers - - try: - server = servers[index] - except IndexError: - raise ValueError( - "Invalid index {0} when selecting the host settings. " - "Must be less than {1}".format(index, len(servers)) - ) - - url = server["url"] - - # go through variables and replace placeholders - for variable_name, variable in server.get("variables", {}).items(): - used_value = variables.get(variable_name, variable["default_value"]) - - if "enum_values" in variable and used_value not in variable["enum_values"]: - raise ValueError( - "The variable `{0}` in the host URL has invalid value " - "{1}. 
Must be {2}.".format(variable_name, variables[variable_name], variable["enum_values"]) - ) - - url = url.replace("{" + variable_name + "}", used_value) - - return url - - @property - def host(self): - """Return generated host.""" - return self.get_host_from_settings(self.server_index, variables=self.server_variables) - - @host.setter - def host(self, value): - """Fix base path.""" - self._base_path = value - self.server_index = None diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py deleted file mode 100644 index 5dbd4b0..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py +++ /dev/null @@ -1,197 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - -from typing import Any, Optional -from typing_extensions import Self - - -class OpenApiException(Exception): - """The base exception class for all OpenAPIExceptions""" - - -class ApiTypeError(OpenApiException, TypeError): - def __init__(self, msg, path_to_item=None, valid_classes=None, key_type=None) -> None: - """Raises an exception for TypeErrors - - Args: - msg (str): the exception message - - Keyword Args: - path_to_item (list): a list of keys an indices to get to the - current_item - None if unset - valid_classes (tuple): the primitive classes that current item - should be an instance of - None if unset - key_type (bool): False if our value is a value in a dict - True if it is a key in a dict - False if our item is an item in a list - None if unset - """ - self.path_to_item = path_to_item - self.valid_classes = valid_classes - self.key_type = key_type - full_msg = msg - if path_to_item: - full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) - super(ApiTypeError, self).__init__(full_msg) - - -class ApiValueError(OpenApiException, ValueError): - def __init__(self, msg, path_to_item=None) -> None: - """ - Args: - msg (str): the exception message - - Keyword Args: - path_to_item (list) the path to the exception in the - received_data dict. None if unset - """ - - self.path_to_item = path_to_item - full_msg = msg - if path_to_item: - full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) - super(ApiValueError, self).__init__(full_msg) - - -class ApiAttributeError(OpenApiException, AttributeError): - def __init__(self, msg, path_to_item=None) -> None: - """ - Raised when an attribute reference or assignment fails. 
- - Args: - msg (str): the exception message - - Keyword Args: - path_to_item (None/list) the path to the exception in the - received_data dict - """ - self.path_to_item = path_to_item - full_msg = msg - if path_to_item: - full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) - super(ApiAttributeError, self).__init__(full_msg) - - -class ApiKeyError(OpenApiException, KeyError): - def __init__(self, msg, path_to_item=None) -> None: - """ - Args: - msg (str): the exception message - - Keyword Args: - path_to_item (None/list) the path to the exception in the - received_data dict - """ - self.path_to_item = path_to_item - full_msg = msg - if path_to_item: - full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) - super(ApiKeyError, self).__init__(full_msg) - - -class ApiException(OpenApiException): - - def __init__( - self, - status=None, - reason=None, - http_resp=None, - *, - body: Optional[str] = None, - data: Optional[Any] = None, - ) -> None: - self.status = status - self.reason = reason - self.body = body - self.data = data - self.headers = None - - if http_resp: - if self.status is None: - self.status = http_resp.status - if self.reason is None: - self.reason = http_resp.reason - if self.body is None: - try: - self.body = http_resp.data.decode("utf-8") - except Exception: - pass - self.headers = http_resp.getheaders() - - @classmethod - def from_response( - cls, - *, - http_resp, - body: Optional[str], - data: Optional[Any], - ) -> Self: - if http_resp.status == 400: - raise BadRequestException(http_resp=http_resp, body=body, data=data) - - if http_resp.status == 401: - raise UnauthorizedException(http_resp=http_resp, body=body, data=data) - - if http_resp.status == 403: - raise ForbiddenException(http_resp=http_resp, body=body, data=data) - - if http_resp.status == 404: - raise NotFoundException(http_resp=http_resp, body=body, data=data) - - if 500 <= http_resp.status <= 599: - raise ServiceException(http_resp=http_resp, body=body, data=data) - 
raise ApiException(http_resp=http_resp, body=body, data=data) - - def __str__(self): - """Custom error messages for exception""" - error_message = "({0})\n" "Reason: {1}\n".format(self.status, self.reason) - if self.headers: - error_message += "HTTP response headers: {0}\n".format(self.headers) - - if self.data or self.body: - error_message += "HTTP response body: {0}\n".format(self.data or self.body) - - return error_message - - -class BadRequestException(ApiException): - pass - - -class NotFoundException(ApiException): - pass - - -class UnauthorizedException(ApiException): - pass - - -class ForbiddenException(ApiException): - pass - - -class ServiceException(ApiException): - pass - - -def render_path(path_to_item): - """Returns a string representation of a path""" - result = "" - for pth in path_to_item: - if isinstance(pth, int): - result += "[{0}]".format(pth) - else: - result += "['{0}']".format(pth) - return result diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/confluence_parameters.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/confluence_parameters.py deleted file mode 100644 index e24f0ad..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/confluence_parameters.py +++ /dev/null @@ -1,137 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -from __future__ import annotations - -import json -import pprint -import re # noqa: F401 -from typing import Any, ClassVar, Dict, List, Optional, Set - -from pydantic import BaseModel, ConfigDict, Field, StrictBool, StrictStr -from typing import Any, ClassVar, Dict, List, Optional -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair -from typing import Optional, Set -from typing_extensions import Self - - -class ConfluenceParameters(BaseModel): - """ """ # noqa: E501 - - url: StrictStr = Field(description="url of the confluence space.") - token: StrictStr = Field(description="api key to access confluence.") - space_key: StrictStr = Field(description="the space key of the confluence pages.") - include_attachments: Optional[StrictBool] = Field( - default=False, - description="whether to include file attachments (e.g., images, documents) in the parsed content. Default is `false`.", - ) - keep_markdown_format: Optional[StrictBool] = Field( - default=True, description="whether to preserve markdown formatting in the output. Default is `true`." - ) - keep_newlines: Optional[StrictBool] = Field( - default=True, - description="whether to retain newline characters in the output for better readability. Default is `true`.", - ) - document_name: StrictStr = Field( - description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." 
- ) - confluence_kwargs: Optional[List[KeyValuePair]] = Field( - default=None, description="Additional kwargs like verify_ssl" - ) - __properties: ClassVar[List[str]] = [ - "url", - "token", - "space_key", - "include_attachments", - "keep_markdown_format", - "keep_newlines", - "document_name", - "confluence_kwargs", - ] - - model_config = ConfigDict( - populate_by_name=True, - validate_assignment=True, - protected_namespaces=(), - ) - - def to_str(self) -> str: - """Returns the string representation of the model using alias""" - return pprint.pformat(self.model_dump(by_alias=True)) - - def to_json(self) -> str: - """Returns the JSON representation of the model using alias""" - return self.model_dump_json(by_alias=True, exclude_unset=True) - - @classmethod - def from_json(cls, json_str: str) -> Optional[Self]: - """Create an instance of ConfluenceParameters from a JSON string""" - return cls.from_dict(json.loads(json_str)) - - def to_dict(self) -> Dict[str, Any]: - """Return the dictionary representation of the model using alias. - - This has the following differences from calling pydantic's - `self.model_dump(by_alias=True)`: - - * `None` is only added to the output dict for nullable fields that - were set at model initialization. Other fields with value `None` - are ignored. 
- """ - excluded_fields: Set[str] = set([]) - - _dict = self.model_dump( - by_alias=True, - exclude=excluded_fields, - exclude_none=True, - ) - # override the default output from pydantic by calling `to_dict()` of each item in confluence_kwargs (list) - _items = [] - if self.confluence_kwargs: - for _item_confluence_kwargs in self.confluence_kwargs: - if _item_confluence_kwargs: - _items.append(_item_confluence_kwargs.to_dict()) - _dict["confluence_kwargs"] = _items - return _dict - - @classmethod - def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: - """Create an instance of ConfluenceParameters from a dict""" - if obj is None: - return None - - if not isinstance(obj, dict): - return cls.model_validate(obj) - - _obj = cls.model_validate( - { - "url": obj.get("url"), - "token": obj.get("token"), - "space_key": obj.get("space_key"), - "include_attachments": ( - obj.get("include_attachments") if obj.get("include_attachments") is not None else False - ), - "keep_markdown_format": ( - obj.get("keep_markdown_format") if obj.get("keep_markdown_format") is not None else True - ), - "keep_newlines": obj.get("keep_newlines") if obj.get("keep_newlines") is not None else True, - "document_name": obj.get("document_name"), - "confluence_kwargs": ( - [KeyValuePair.from_dict(_item) for _item in obj["confluence_kwargs"]] - if obj.get("confluence_kwargs") is not None - else None - ), - } - ) - return _obj diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py deleted file mode 100644 index 4f9f9af..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the 
OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. -""" # noqa: E501 - - -from __future__ import annotations -import pprint -import re # noqa: F401 -import json - -from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr -from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair -from typing import Optional, Set -from typing_extensions import Self - - -class ExtractionRequest(BaseModel): - """ """ # noqa: E501 - - file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None - type: StrictStr - kwargs: Optional[List[KeyValuePair]] = None - __properties: ClassVar[List[str]] = ["file", "type", "kwargs"] - - model_config = ConfigDict( - populate_by_name=True, - validate_assignment=True, - protected_namespaces=(), - ) - - def to_str(self) -> str: - """Returns the string representation of the model using alias""" - return pprint.pformat(self.model_dump(by_alias=True)) - - def to_json(self) -> str: - """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) - - @classmethod - def from_json(cls, json_str: str) -> Optional[Self]: - """Create an instance of ExtractionRequest from a JSON string""" - return cls.from_dict(json.loads(json_str)) - - def to_dict(self) -> Dict[str, Any]: - """Return the dictionary representation of the model using alias. - - This has the following differences from calling pydantic's - `self.model_dump(by_alias=True)`: - - * `None` is only added to the output dict for nullable fields that - were set at model initialization. Other fields with value `None` - are ignored. 
- """ - excluded_fields: Set[str] = set([]) - - _dict = self.model_dump( - by_alias=True, - exclude=excluded_fields, - exclude_none=True, - ) - # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) - _items = [] - if self.kwargs: - for _item_kwargs in self.kwargs: - if _item_kwargs: - _items.append(_item_kwargs.to_dict()) - _dict["kwargs"] = _items - return _dict - - @classmethod - def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: - """Create an instance of ExtractionRequest from a dict""" - if obj is None: - return None - - if not isinstance(obj, dict): - return cls.model_validate(obj) - - _obj = cls.model_validate( - { - "file": obj.get("file"), - "type": obj.get("type"), - "kwargs": ( - [KeyValuePair.from_dict(_item) for _item in obj["kwargs"]] - if obj.get("kwargs") is not None - else None - ), - } - ) - return _obj diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py deleted file mode 100644 index 60fc660..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py +++ /dev/null @@ -1,209 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import io -import json -import re -import ssl - -import urllib3 - -from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiException, ApiValueError - -SUPPORTED_SOCKS_PROXIES = {"socks5", "socks5h", "socks4", "socks4a"} -RESTResponseType = urllib3.HTTPResponse - - -def is_socks_proxy_url(url): - if url is None: - return False - split_section = url.split("://") - if len(split_section) < 2: - return False - else: - return split_section[0].lower() in SUPPORTED_SOCKS_PROXIES - - -class RESTResponse(io.IOBase): - - def __init__(self, resp) -> None: - self.response = resp - self.status = resp.status - self.reason = resp.reason - self.data = None - - def read(self): - if self.data is None: - self.data = self.response.data - return self.data - - def getheaders(self): - """Returns a dictionary of the response headers.""" - return self.response.headers - - def getheader(self, name, default=None): - """Returns a given response header.""" - return self.response.headers.get(name, default) - - -class RESTClientObject: - - def __init__(self, configuration) -> None: - # urllib3.PoolManager will pass all kw parameters to connectionpool - # https://github.com/shazow/urllib3/blob/f9409436f83aeb79fbaf090181cd81b784f1b8ce/urllib3/poolmanager.py#L75 # noqa: E501 - # https://github.com/shazow/urllib3/blob/f9409436f83aeb79fbaf090181cd81b784f1b8ce/urllib3/connectionpool.py#L680 # noqa: E501 - # Custom SSL certificates and client certificates: http://urllib3.readthedocs.io/en/latest/advanced-usage.html # noqa: E501 - - # cert_reqs - if configuration.verify_ssl: - cert_reqs = ssl.CERT_REQUIRED - else: - cert_reqs = ssl.CERT_NONE - - pool_args = { - "cert_reqs": cert_reqs, - "ca_certs": configuration.ssl_ca_cert, - "cert_file": configuration.cert_file, - "key_file": configuration.key_file, - } - if configuration.assert_hostname is not None: - pool_args["assert_hostname"] = configuration.assert_hostname - - if configuration.retries is not None: - 
pool_args["retries"] = configuration.retries - - if configuration.tls_server_name: - pool_args["server_hostname"] = configuration.tls_server_name - - if configuration.socket_options is not None: - pool_args["socket_options"] = configuration.socket_options - - if configuration.connection_pool_maxsize is not None: - pool_args["maxsize"] = configuration.connection_pool_maxsize - - # https pool manager - self.pool_manager: urllib3.PoolManager - - if configuration.proxy: - if is_socks_proxy_url(configuration.proxy): - from urllib3.contrib.socks import SOCKSProxyManager - - pool_args["proxy_url"] = configuration.proxy - pool_args["headers"] = configuration.proxy_headers - self.pool_manager = SOCKSProxyManager(**pool_args) - else: - pool_args["proxy_url"] = configuration.proxy - pool_args["proxy_headers"] = configuration.proxy_headers - self.pool_manager = urllib3.ProxyManager(**pool_args) - else: - self.pool_manager = urllib3.PoolManager(**pool_args) - - def request(self, method, url, headers=None, body=None, post_params=None, _request_timeout=None): - """Perform requests. - - :param method: http request method - :param url: http request url - :param headers: http request headers - :param body: request json body, for `application/json` - :param post_params: request post parameters, - `application/x-www-form-urlencoded` - and `multipart/form-data` - :param _request_timeout: timeout setting for this request. If one - number provided, it will be total request - timeout. It can also be a pair (tuple) of - (connection, read) timeouts. 
- """ - method = method.upper() - assert method in ["GET", "HEAD", "DELETE", "POST", "PUT", "PATCH", "OPTIONS"] - - if post_params and body: - raise ApiValueError("body parameter cannot be used with post_params parameter.") - - post_params = post_params or {} - headers = headers or {} - - timeout = None - if _request_timeout: - if isinstance(_request_timeout, (int, float)): - timeout = urllib3.Timeout(total=_request_timeout) - elif isinstance(_request_timeout, tuple) and len(_request_timeout) == 2: - timeout = urllib3.Timeout(connect=_request_timeout[0], read=_request_timeout[1]) - - try: - # For `POST`, `PUT`, `PATCH`, `OPTIONS`, `DELETE` - if method in ["POST", "PUT", "PATCH", "OPTIONS", "DELETE"]: - - # no content type provided or payload is json - content_type = headers.get("Content-Type") - if not content_type or re.search("json", content_type, re.IGNORECASE): - request_body = None - if body is not None: - request_body = json.dumps(body) - r = self.pool_manager.request( - method, url, body=request_body, timeout=timeout, headers=headers, preload_content=False - ) - elif content_type == "application/x-www-form-urlencoded": - r = self.pool_manager.request( - method, - url, - fields=post_params, - encode_multipart=False, - timeout=timeout, - headers=headers, - preload_content=False, - ) - elif content_type == "multipart/form-data": - # must del headers['Content-Type'], or the correct - # Content-Type which generated by urllib3 will be - # overwritten. - del headers["Content-Type"] - # Ensures that dict objects are serialized - post_params = [(a, json.dumps(b)) if isinstance(b, dict) else (a, b) for a, b in post_params] - r = self.pool_manager.request( - method, - url, - fields=post_params, - encode_multipart=True, - timeout=timeout, - headers=headers, - preload_content=False, - ) - # Pass a `string` parameter directly in the body to support - # other content types than JSON when `body` argument is - # provided in serialized form. 
- elif isinstance(body, str) or isinstance(body, bytes): - r = self.pool_manager.request( - method, url, body=body, timeout=timeout, headers=headers, preload_content=False - ) - elif headers["Content-Type"].startswith("text/") and isinstance(body, bool): - request_body = "true" if body else "false" - r = self.pool_manager.request( - method, url, body=request_body, preload_content=False, timeout=timeout, headers=headers - ) - else: - # Cannot generate the request from given parameters - msg = """Cannot prepare a request message for provided - arguments. Please check that your arguments match - declared content type.""" - raise ApiException(status=0, reason=msg) - # For `GET`, `HEAD` - else: - r = self.pool_manager.request( - method, url, fields={}, timeout=timeout, headers=headers, preload_content=False - ) - except urllib3.exceptions.SSLError as e: - msg = "\n".join([type(e).__name__, str(e)]) - raise ApiException(status=0, reason=msg) - - return RESTResponse(r) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py deleted file mode 100644 index 5a78d9b..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py +++ /dev/null @@ -1,35 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import unittest - -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType - - -class TestContentType(unittest.TestCase): - """ContentType unit test stubs""" - - def setUp(self): - pass - - def tearDown(self): - pass - - def testContentType(self): - """Test ContentType""" - # inst = ContentType() - - -if __name__ == "__main__": - unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py deleted file mode 100644 index 2f8f1bd..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py +++ /dev/null @@ -1,58 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import unittest - -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest - - -class TestExtractionRequest(unittest.TestCase): - """ExtractionRequest unit test stubs""" - - def setUp(self): - pass - - def tearDown(self): - pass - - def make_instance(self, include_optional) -> ExtractionRequest: - """Test ExtractionRequest - include_optional is a boolean, when False only required - params are included, when True both required and - optional params are included""" - # uncomment below to create an instance of `ExtractionRequest` - """ - model = ExtractionRequest() - if include_optional: - return ExtractionRequest( - file = bytes(b'blah'), - type = '', - kwargs = [ - {"value":"value","key":"key"} - ] - ) - else: - return ExtractionRequest( - type = '', - ) - """ - - def testExtractionRequest(self): - """Test ExtractionRequest""" - # inst_req_only = self.make_instance(include_optional=False) - # inst_req_and_optional = self.make_instance(include_optional=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py deleted file mode 100644 index f39a507..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py +++ /dev/null @@ -1,35 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import unittest - -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi - - -class TestExtractorApi(unittest.TestCase): - """ExtractorApi unit test stubs""" - - def setUp(self) -> None: - self.api = ExtractorApi() - - def tearDown(self) -> None: - pass - - def test_extract_from_file_post(self) -> None: - """Test case for extract_from_file_post""" - pass - - -if __name__ == "__main__": - unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py deleted file mode 100644 index 479c858..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import unittest - -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece - - -class TestInformationPiece(unittest.TestCase): - """InformationPiece unit test stubs""" - - def setUp(self): - pass - - def tearDown(self): - pass - - def make_instance(self, include_optional) -> InformationPiece: - """Test InformationPiece - include_optional is a boolean, when False only required - params are included, when True both required and - optional params are included""" - # uncomment below to create an instance of `InformationPiece` - """ - model = InformationPiece() - if include_optional: - return InformationPiece( - metadata = [ - {"value":"value","key":"key"} - ], - page_content = '', - type = 'IMAGE' - ) - else: - return InformationPiece( - metadata = [ - {"value":"value","key":"key"} - ], - page_content = '', - type = 'IMAGE', - ) - """ - - def testInformationPiece(self): - """Test InformationPiece""" - # inst_req_only = self.make_instance(include_optional=False) - # inst_req_and_optional = self.make_instance(include_optional=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py deleted file mode 100644 index 0ddc864..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py +++ /dev/null @@ -1,54 +0,0 @@ -# coding: utf-8 - -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -import unittest - -from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair - - -class TestKeyValuePair(unittest.TestCase): - """KeyValuePair unit test stubs""" - - def setUp(self): - pass - - def tearDown(self): - pass - - def make_instance(self, include_optional) -> KeyValuePair: - """Test KeyValuePair - include_optional is a boolean, when False only required - params are included, when True both required and - optional params are included""" - # uncomment below to create an instance of `KeyValuePair` - """ - model = KeyValuePair() - if include_optional: - return KeyValuePair( - key = None, - value = None - ) - else: - return KeyValuePair( - ) - """ - - def testKeyValuePair(self): - """Test KeyValuePair""" - # inst_req_only = self.make_instance(include_optional=False) - # inst_req_and_optional = self.make_instance(include_optional=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/file_services/__init__.py b/admin-api-lib/src/admin_api_lib/file_services/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index 25745c5..dd39f3c 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -1,9 +1,7 @@ """Module containing the implementation of the Admin API.""" -from dataclasses import Field import logging from typing import List, Optional -from typing_extensions import Annotated from pydantic import Field, StrictBytes, StrictStr from admin_api_lib.api_endpoints.source_uploader import SourceUploader @@ -12,12 +10,11 @@ from dependency_injector.wiring import Provide, inject from fastapi import Depends, Request, Response, UploadFile -from admin_api_lib.api_endpoints.confluence_loader import ConfluenceLoader + from admin_api_lib.api_endpoints.document_deleter import 
DocumentDeleter from admin_api_lib.api_endpoints.document_reference_retriever import ( DocumentReferenceRetriever, ) -from admin_api_lib.api_endpoints.document_uploader import DocumentUploader from admin_api_lib.api_endpoints.documents_status_retriever import ( DocumentsStatusRetriever, ) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py deleted file mode 100644 index 54fcfda..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py +++ /dev/null @@ -1,195 +0,0 @@ -"""Module for the DefaultConfluenceLoader class.""" - -import logging -from asyncio import run -from threading import Thread -import threading - -from fastapi import HTTPException, status -from langchain_core.documents import Document - -from admin_api_lib.api_endpoints.confluence_loader import ConfluenceLoader -from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter -from admin_api_lib.chunker.chunker import Chunker -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( - ExtractorApi, -) -from admin_api_lib.impl.key_db.file_status_key_value_store import ( - FileStatusKeyValueStore, -) -from admin_api_lib.impl.mapper.confluence_settings_mapper import ( - ConfluenceSettingsMapper, -) -from admin_api_lib.impl.mapper.informationpiece2document import ( - InformationPiece2Document, -) -from admin_api_lib.impl.settings.confluence_settings import ConfluenceSettings -from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer -from admin_api_lib.models.status import Status -from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi -from admin_api_lib.utils.utils import sanitize_document_name - -logger = logging.getLogger(__name__) - - -class DefaultConfluenceLoader(ConfluenceLoader): - """ - DefaultConfluenceLoader is responsible for loading content from 
Confluence asynchronously. - - Attributes - ---------- - CONFLUENCE_SPACE : str - The Confluence space key. - """ - - CONFLUENCE_SPACE = "confluence_space" - - def __init__( - self, - extractor_api: ExtractorApi, - settings: ConfluenceSettings, - information_mapper: InformationPiece2Document, - rag_api: RagApi, - key_value_store: FileStatusKeyValueStore, - information_enhancer: InformationEnhancer, - chunker: Chunker, - document_deleter: DocumentDeleter, - settings_mapper: ConfluenceSettingsMapper, - ): - """ - Initialize the DefaultConfluenceLoader with the provided dependencies. - - Parameters - ---------- - extractor_api : ExtractorApi - The API for extracting information. - settings : ConfluenceSettings - The settings for Confluence. - information_mapper : InformationPiece2Document - The mapper for information pieces to langchain documents. - rag_api : RagApi - The API client for interacting with the RAG backend system. - key_value_store : FileStatusKeyValueStore - The key-value store to store file names and the corresponding file statuses. - information_enhancer : InformationEnhancer - The enhancer for information pieces. - chunker : Chunker - The chunker for breaking down documents into chunks. - document_deleter : DocumentDeleter - The deleter for documents from S3 Storage and Vector Database. - settings_mapper : ConfluenceSettingsMapper - The mapper to map the Confluence settings to confluence parameters. - """ - self._extractor_api = extractor_api - self._rag_api = rag_api - self._settings = settings - self._key_value_store = key_value_store - self._information_mapper = information_mapper - self._information_enhancer = information_enhancer - self._chunker = chunker - self._document_deleter = document_deleter - self._settings_mapper = settings_mapper - self._background_thread = None - self._document_key = None - - async def aload_from_confluence(self) -> None: - """ - Asynchronously loads content from Confluence using the configured settings. 
- - Raises - ------ - HTTPException - If the Confluence loader is not configured or if a load is already in progress. - """ - for index in range(len(self._settings.url)): - if not ( - self._settings.url[index].strip() - and self._settings.space_key[index].strip() - and self._settings.token[index].strip() - ): - raise HTTPException( - status.HTTP_501_NOT_IMPLEMENTED, - "The confluence loader is not configured! Required fields are missing.", - ) - - if self._background_thread is not None and self._background_thread.is_alive(): - raise HTTPException( - status.HTTP_423_LOCKED, "Confluence loader is locked... Please wait for the current load to finish." - ) - self._background_thread = Thread(target=lambda: run(self._aload_from_confluence())) - self._background_thread.start() - - async def _aload_from_confluence(self) -> None: - async def process_confluence(index): - logger.info("Loading from Confluence %s", self._settings.url[index]) - self._sanitize_document_name(index=index) - - params = self._settings_mapper.map_settings_to_params(self._settings, index) - try: - self._key_value_store.upsert(self._settings.document_name[index], Status.PROCESSING) - information_pieces = self._extractor_api.extract_from_confluence_post(params) - documents = [ - self._information_mapper.extractor_information_piece2document(x) for x in information_pieces - ] - documents = await self._aenhance_langchain_documents(documents) - chunked_documents = self._chunker.chunk(documents) - rag_information_pieces = [ - self._information_mapper.document2rag_information_piece(doc) for doc in chunked_documents - ] - except Exception as e: - self._key_value_store.upsert(self._settings.document_name[index], Status.ERROR) - - logger.error("Error while loading from Confluence: %s", str(e)) - raise HTTPException( - status.HTTP_500_INTERNAL_SERVER_ERROR, f"Error loading from Confluence: {str(e)}" - ) from e - - await self._delete_previous_information_pieces(index=index) - 
self._key_value_store.upsert(self._settings.document_name[index], Status.UPLOADING) - self._upload_information_pieces(rag_information_pieces, index=index) - - threads = [] - for idx in range(len(self._settings.url)): - t = threading.Thread(target=lambda idx=idx: run(process_confluence(idx))) - threads.append(t) - t.start() - for t in threads: - t.join() - - async def _aenhance_langchain_documents(self, documents: list[Document]): - try: - return await self._information_enhancer.ainvoke(documents) - except Exception as e: - logger.error("Exception occured while enhancing confluence langchain document %s" % e) - raise e - - async def _delete_previous_information_pieces(self, index=0): - try: - await self._document_deleter.adelete_document(self._settings.document_name[index]) - except HTTPException as e: - logger.error( - ( - "Error while trying to delete documents with id: %s before uploading %s." - "NOTE: Still continuing with upload." - ), - self._settings.document_name[index], - e, - ) - - def _upload_information_pieces(self, rag_api_documents, index=0): - try: - self._rag_api.upload_information_piece(rag_api_documents) - self._key_value_store.upsert(self._settings.document_name[index], Status.READY) - logger.info("Confluence loaded successfully") - except Exception as e: - self._key_value_store.upsert(self._settings.document_name[index], Status.ERROR) - logger.error("Error while uploading Confluence to the database: %s", str(e)) - raise HTTPException(500, f"Error loading from Confluence: {str(e)}") from e - - def _sanitize_document_name(self, index) -> None: - document_name = ( - self._settings.document_name[index] if self._settings.document_name[index] else self._settings.url[index] - ) - document_name = document_name.replace("http://", "").replace("https://", "") - - self._settings.document_name[index] = sanitize_document_name(document_name) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_uploader.py 
b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_uploader.py deleted file mode 100644 index 549be19..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_uploader.py +++ /dev/null @@ -1,192 +0,0 @@ -"""Module for the DefaultDocumentUploader class.""" - -import logging -import tempfile -import traceback -import urllib -from asyncio import run -from pathlib import Path -from threading import Thread - -from fastapi import HTTPException, Request, UploadFile, status - -from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter -from admin_api_lib.api_endpoints.document_uploader import DocumentUploader -from admin_api_lib.chunker.chunker import Chunker -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( - ExtractorApi, -) -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ( - ExtractionRequest, -) -from admin_api_lib.file_services.file_service import FileService -from admin_api_lib.impl.key_db.file_status_key_value_store import ( - FileStatusKeyValueStore, -) -from admin_api_lib.impl.mapper.informationpiece2document import ( - InformationPiece2Document, -) -from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer -from admin_api_lib.models.status import Status -from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi -from admin_api_lib.utils.utils import sanitize_document_name - -logger = logging.getLogger(__name__) - - -class DefaultDocumentUploader(DocumentUploader): - """DefaultDocumentUploader is responsible for handling the upload, processing, and storage of documents.""" - - def __init__( - self, - document_extractor: ExtractorApi, - file_service: FileService, - rag_api: RagApi, - information_enhancer: InformationEnhancer, - information_mapper: InformationPiece2Document, - chunker: Chunker, - key_value_store: FileStatusKeyValueStore, - document_deleter: DocumentDeleter, - ): - """ 
- Initialize the DefaultDocumentUploader. - - Parameters - ---------- - document_extractor : ExtractorApi - The API for extracting documents. - file_service : FileService - The service for handling file operations on the S3 storage - rag_api : RagApi - The API for RAG backend. - information_enhancer : InformationEnhancer - The service for enhancing information. - information_mapper : InformationPiece2Document - The mapper for converting information pieces to langchain documents. - chunker : Chunker - The service for chunking documents into chunks. - key_value_store : FileStatusKeyValueStore - The key-value store for storing filename and the corresponding status. - document_deleter : DocumentDeleter - The service for deleting documents. - """ - self._document_extractor = document_extractor - self._file_service = file_service - self._rag_api = rag_api - self._information_enhancer = information_enhancer - self._information_mapper = information_mapper - self._chunker = chunker - self._key_value_store = key_value_store - self._document_deleter = document_deleter - self._background_threads = [] - - async def aupload_documents_post( - self, - body: UploadFile, - request: Request, - ) -> None: - """ - Handle the uploading of documents via a POST request. - - This asynchronous method reads the content of the uploaded file and starts a background - thread to save the document in S3 storage and the vector database. It updates the status - of the document in the key-value store and handles any exceptions that may occur during - the process. - - Parameters - ---------- - body : UploadFile - The uploaded file. - request : Request - The request object. - - Raises - ------ - HTTPException - If there is a ValueError, raises a 400 Bad Request error. - HTTPException - If there is any other exception, raises a 500 Internal Server Error. 
- """ - self._background_threads = [t for t in self._background_threads if t.is_alive()] - content = await body.read() - body.filename = sanitize_document_name(body.filename) - try: - self._key_value_store.upsert(body.filename, Status.UPLOADING) - thread = Thread(target=lambda: run(self._asave_new_document(content, body.filename, request))) - thread.start() - self._background_threads.append(thread) - except ValueError as e: - self._key_value_store.upsert(body.filename, Status.ERROR) - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) - except Exception as e: - self._key_value_store.upsert(body.filename, Status.ERROR) - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) - - async def _asave_new_document( - self, - file_content: bytes, - filename: str, - request: Request, - ): - try: - await self._document_deleter.adelete_document(filename) - except HTTPException as e: - logger.error( - "Error while trying to delete file %s before uploading %s. 
Still continuing with upload.", filename, e - ) - self._key_value_store.upsert(filename, Status.ERROR) - - try: - with tempfile.TemporaryDirectory() as temp_dir: - temp_file_path = Path(temp_dir) / filename - with open(temp_file_path, "wb") as temp_file: - logger.debug("Temporary file created at %s.", temp_file_path) - temp_file.write(file_content) - logger.debug("Temp file created and content written.") - - await self._aparse_document(Path(temp_file_path), request) - except Exception as e: - logger.error("Error during document parsing: %s %s", e, traceback.format_exc()) - self._key_value_store.upsert(filename, Status.ERROR) - - async def _aparse_document( - self, - s3_file_path: Path, - request: Request, - ): - logger.debug("START parsing of the document %s", s3_file_path) - filename = s3_file_path.name - - self._file_service.upload_file(s3_file_path, filename) - self._key_value_store.upsert(filename, Status.PROCESSING) - - information_pieces = self._document_extractor.extract_from_file_post(ExtractionRequest(path_on_s3=filename)) - if not information_pieces: - self._key_value_store.upsert(filename, Status.ERROR) - logger.error("No information pieces found in the document: %s", filename) - raise HTTPException(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail="No information pieces found") - documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces] - host_base_url = str(request.base_url) - document_url = f"{host_base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(filename)}" - - chunked_documents = self._chunker.chunk(documents) - - for idx, chunk in enumerate(chunked_documents): - if chunk.metadata["id"] in chunk.metadata["related"]: - chunk.metadata["related"].remove(chunk.metadata["id"]) - chunk.metadata.update( - { - "chunk": idx, - "chunk_length": len(chunk.page_content), - "document_url": document_url, - } - ) - - enhanced_documents = await 
self._information_enhancer.ainvoke(chunked_documents) - rag_information_pieces = [ - self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents - ] - - self._rag_api.upload_information_piece(rag_information_pieces) - self._key_value_store.upsert(filename, Status.READY) - logger.info("File uploaded successfully: %s", filename) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 1b2f31c..d520293 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -1,10 +1,13 @@ from http.client import HTTPException import logging -from typing import Optional +import os +from pathlib import Path +from typing import Optional, Tuple, Union from threading import Thread import urllib +import tempfile -from pydantic import StrictStr +from pydantic import StrictBytes, StrictStr from fastapi import UploadFile, status from langchain_core.documents import Document from asyncio import run @@ -16,7 +19,7 @@ from admin_api_lib.api_endpoints.source_uploader import SourceUploader from admin_api_lib.chunker.chunker import Chunker from admin_api_lib.models.status import Status -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi +from admin_api_lib.extractor_api_client.extractor_api_client import ExtractorApiClient from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer from admin_api_lib.utils.utils import sanitize_document_name @@ -28,7 +31,7 @@ class DefaultSourceUploader(SourceUploader): def __init__( self, - extractor_api: ExtractorApi, + extractor_api: ExtractorApiClient, key_value_store: FileStatusKeyValueStore, information_enhancer: InformationEnhancer, 
chunker: Chunker, @@ -60,8 +63,14 @@ async def upload_source( self._key_value_store.upsert( source_name, Status.PROCESSING ) # TODO: change to pipeline with timeout to error status + filename = None + if file: + content = await file.read() + filename = Path("/tmp/" + file.filename) + with open(filename, "wb") as tmpfile: + tmpfile.write(content) thread = Thread( - target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, file, kwargs)) + target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, filename, kwargs)) ) thread.start() self._background_threads.append(thread) @@ -79,11 +88,15 @@ async def _handle_source_upload( base_url: str, type: StrictStr, name: StrictStr, - file: Optional[UploadFile], + file, #: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]], kwargs: Optional[list[KeyValuePair]], ): try: - information_pieces = self._extractor_api.extract(type, name, file, kwargs) + if file: + information_pieces = self._extractor_api.extract(type, source_name, str(file), kwargs) + else: + information_pieces = self._extractor_api.extract(type, source_name, None, kwargs) + if not information_pieces: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("No information pieces found in the document: %s", source_name) @@ -98,10 +111,16 @@ async def _handle_source_upload( ] # Replace old document - await self._document_deleter.adelete_document(source_name) + try: + await self._document_deleter.adelete_document(source_name) + except Exception as e: + # deletion is allowed to fail + pass self._rag_api.upload_information_piece(rag_information_pieces) self._key_value_store.upsert(source_name, Status.READY) logger.info("File uploaded successfully: %s", source_name) + if file: + os.remove(file) except Exception as e: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("Error while uploading %s = %s", source_name, str(e)) diff --git 
a/admin-api-lib/src/admin_api_lib/impl/mapper/confluence_settings_mapper.py b/admin-api-lib/src/admin_api_lib/impl/mapper/confluence_settings_mapper.py deleted file mode 100644 index 552535f..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/mapper/confluence_settings_mapper.py +++ /dev/null @@ -1,36 +0,0 @@ -"""Module for the ConfluenceSettingsMapper class.""" - -from admin_api_lib.extractor_api_client.openapi_client.models.confluence_parameters import ( - ConfluenceParameters, -) -from admin_api_lib.impl.settings.confluence_settings import ConfluenceSettings - - -class ConfluenceSettingsMapper: - """Mapper class for converting ConfluenceSettings to ConfluenceParameters.""" - - @staticmethod - def map_settings_to_params(settings: ConfluenceSettings, index) -> ConfluenceParameters: - """ - Map ConfluenceSettings to ConfluenceParameters. - - Parameters - ---------- - settings : ConfluenceSettings - The settings object containing Confluence configuration. - - Returns - ------- - ConfluenceParameters - The parameters object for API consumption. 
- """ - return ConfluenceParameters( - url=settings.url[index], - token=settings.token[index], - space_key=settings.space_key[index], - include_attachments=settings.include_attachments[index], - keep_markdown_format=settings.keep_markdown_format[index], - keep_newlines=settings.keep_newlines[index], - document_name=settings.document_name[index], - confluence_kwargs=[{"key": "verify_ssl", "value": settings.verify_ssl[index]}], - ) diff --git a/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py b/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py index a3a40ce..6f0ac2f 100644 --- a/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py +++ b/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py @@ -4,10 +4,10 @@ from langchain_core.documents import Document as LangchainDocument -from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ( +from admin_api_lib.extractor_api_client.models.content_type import ( ContentType as ExtractorInformaType, ) -from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import ( +from admin_api_lib.extractor_api_client.models.information_piece import ( InformationPiece as ExtractorInformationPiece, ) from admin_api_lib.rag_backend_client.openapi_client.models.information_piece import ( diff --git a/admin-api-lib/src/admin_api_lib/impl/settings/confluence_settings.py b/admin-api-lib/src/admin_api_lib/impl/settings/confluence_settings.py deleted file mode 100644 index acf77fc..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/settings/confluence_settings.py +++ /dev/null @@ -1,170 +0,0 @@ -"""Contains settings regarding the confluence.""" - -from typing import Optional -from admin_api_lib.impl.utils.comma_separated_bool_list import CommaSeparatedBoolList -from admin_api_lib.impl.utils.comma_separated_str_list import CommaSeparatedStrList -from pydantic import Field, model_validator -from pydantic_settings 
import BaseSettings -import logging - -logger = logging.getLogger(__name__) - - -class ConfluenceSettings(BaseSettings): - """ - Contains configuration settings for the Confluence integration. - - Parameters - ---------- - url : CommaSeparatedStrList, optional - List of Confluence URLs. - token : CommaSeparatedStrList, optional - List of authentication tokens. - space_key : CommaSeparatedStrList, optional - List of Confluence space keys. - document_name : CommaSeparatedStrList, optional - List of document names. - verify_ssl : CommaSeparatedBoolList, optional - List of booleans indicating whether SSL verification is enabled. - include_attachments : CommaSeparatedBoolList, optional - Indicates whether to include attachments in the integration. - keep_markdown_format : CommaSeparatedBoolList, optional - Determines if markdown formatting is maintained. - keep_newlines : CommaSeparatedBoolList, optional - Indicates whether newlines are preserved. - """ - - class Config: - """Config class for reading Fields from env.""" - - env_prefix = "CONFLUENCE_" - case_sensitive = False - - url: Optional[CommaSeparatedStrList] = Field(default_factory=CommaSeparatedStrList) - token: Optional[CommaSeparatedStrList] = Field(default_factory=CommaSeparatedStrList) - space_key: Optional[CommaSeparatedStrList] = Field(default_factory=CommaSeparatedStrList) - document_name: Optional[CommaSeparatedStrList] = Field(default_factory=CommaSeparatedStrList) - verify_ssl: Optional[CommaSeparatedBoolList] = Field(default_factory=CommaSeparatedBoolList) - include_attachments: Optional[CommaSeparatedBoolList] = Field(default_factory=CommaSeparatedBoolList) - keep_markdown_format: Optional[CommaSeparatedBoolList] = Field(default_factory=CommaSeparatedBoolList) - keep_newlines: Optional[CommaSeparatedBoolList] = Field(default_factory=CommaSeparatedBoolList) - - @model_validator(mode="after") - def check_lists_length_consistency(cls, values): - """ - Validate that all list-valued settings have the same 
length. - - If not, the list is adjusted accordingly. - - Parameters - ---------- - values : dict - Dictionary of configuration settings. - - Returns - ------- - dict - The validated values dictionary with consistent list lengths. - - Raises - ------ - ValueError - If any non-optional list has a different length compared to others. - """ - # Define the keys to check - keys = [ - "url", - "token", - "space_key", - "document_name", - "verify_ssl", - "include_attachments", - "keep_markdown_format", - "keep_newlines", - ] - - lengths = {} - for key in keys: - value = getattr(values, key, None) - if value is not None: - lengths[key] = len(value) - # If there is more than one list with values, ensure they have the same length - optional_keys = ["document_name", "verify_ssl", "include_attachments", "keep_markdown_format", "keep_newlines"] - if lengths: - # Use the first encountered length as reference - ref_length = next(iter(lengths.values())) - for key, length in lengths.items(): - if length != ref_length and key not in optional_keys: - raise ValueError( - f"Confluence Settings length mismatch: Expected all lists to have {ref_length} elements, " - f"but '{key}' has {length} elements. 
{lengths}" - ) - - urls = getattr(values, "url", None) - if urls and len(urls) > 0: - n = len(urls) - try: - document_name = getattr(values, "document_name", None) - if not document_name or len(document_name) == 0: - values.document_name = CommaSeparatedStrList([""] * n) - elif len(document_name) != n: - raise ValueError("document_name list length mismatch") - except ValueError as e: - logger.error(f"Error setting document_name: {e}") - logger.warning("Setting document_name to default values") - document_name = getattr(values, "document_name", []) - values.document_name = CommaSeparatedStrList(document_name + [""] * (n - len(document_name))) - - try: - verify_ssl = getattr(values, "verify_ssl", None) - if not verify_ssl or len(verify_ssl) == 0: - values.verify_ssl = CommaSeparatedBoolList([True] * n) - elif len(verify_ssl) != n: - raise ValueError("verify_ssl list length mismatch") - except ValueError as e: - logger.error(f"Error setting verify_ssl: {e}") - logger.warning("Setting verify_ssl to default values") - verify_ssl = getattr(values, "verify_ssl", []) - values.verify_ssl = CommaSeparatedBoolList(verify_ssl + [True] * (n - len(verify_ssl))) - - try: - include_attachments = getattr(values, "include_attachments", None) - if not include_attachments or len(include_attachments) == 0: - values.include_attachments = CommaSeparatedBoolList([False] * n) - elif len(include_attachments) != n: - raise ValueError("include_attachments list length mismatch") - except ValueError as e: - logger.error(f"Error setting include_attachments: {e}") - logger.warning("Setting include_attachments to default values") - include_attachments = getattr(values, "include_attachments", []) - values.include_attachments = CommaSeparatedBoolList( - include_attachments + [False] * (n - len(include_attachments)) - ) - - try: - keep_markdown_format = getattr(values, "keep_markdown_format", None) - if not keep_markdown_format or len(keep_markdown_format) == 0: - values.keep_markdown_format = 
CommaSeparatedBoolList([True] * n) - elif len(keep_markdown_format) != n: - raise ValueError("keep_markdown_format list length mismatch") - except ValueError as e: - logger.error(f"Error setting keep_markdown_format: {e}") - logger.warning("Setting keep_markdown_format to default values") - keep_markdown_format = getattr(values, "keep_markdown_format", []) - values.keep_markdown_format = CommaSeparatedBoolList( - keep_markdown_format + [True] * (n - len(keep_markdown_format)) - ) - - try: - keep_newlines = getattr(values, "keep_newlines", None) - if not keep_newlines or len(keep_newlines) == 0: - values.keep_newlines = CommaSeparatedBoolList([True] * n) - elif len(keep_newlines) != n: - raise ValueError("keep_newlines list length mismatch") - except ValueError as e: - logger.error(f"Error setting keep_newlines: {e}") - logger.warning("Setting keep_newlines to default values") - keep_newlines = getattr(values, "keep_newlines", []) - values.keep_newlines = CommaSeparatedBoolList(keep_newlines + [True] * (n - len(keep_newlines))) - - return values diff --git a/extractor-api-lib/openapi.yaml b/extractor-api-lib/openapi.yaml index 262f11b..81ca3e2 100644 --- a/extractor-api-lib/openapi.yaml +++ b/extractor-api-lib/openapi.yaml @@ -86,8 +86,7 @@ components: properties: file: description: "" - format: binary - type: string + type: file type: description: "" type: string diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index 0cbdc2b..38c9a1d 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -51,7 +51,7 @@ async def extract( type: Annotated[str, Form()], name: Annotated[str, Form()], file: Optional[UploadFile] = None, - kwargs: Optional[Annotated[List[KeyValuePair], Form()]]=None, + kwargs: Optional[Annotated[List[KeyValuePair], Form()]] = None, ) -> List[InformationPiece]: if not 
BaseExtractorApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") diff --git a/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py index 8b54f1c..553d79a 100644 --- a/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py @@ -35,7 +35,7 @@ def compatible_file_types(self) -> list[FileType]: """ @abstractmethod - async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: + async def aextract_content(self, file_path: Path, name: str) -> list[InternalInformationPiece]: """ Extract content from given file. @@ -43,7 +43,9 @@ async def aextract_content(self, file_path: Path) -> list[InternalInformationPie ---------- file_path : Path Path to the file the information should be extracted from. - + name : str + Name of the document. 
+ Returns ------- list[InformationPiece] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py index cb04681..c67425d 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py @@ -12,7 +12,6 @@ from unstructured.partition.pptx import partition_pptx - from extractor_api_lib.file_services.file_service import FileService from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor @@ -54,7 +53,7 @@ def compatible_file_types(self) -> list[FileType]: """ return [FileType.DOCX, FileType.PPTX] - async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: + async def aextract_content(self, file_path: Path, name: str) -> list[InternalInformationPiece]: """ Extract content from a given file based on its extension. @@ -62,7 +61,8 @@ async def aextract_content(self, file_path: Path) -> list[InternalInformationPie ---------- file_path : Path The path to the file from which content is to be extracted. - + name : str + Name of the document. Returns ------- list[InformationPiece] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py index 01eb6bf..8d5bd35 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py @@ -37,7 +37,7 @@ class PDFExtractor(InformationFileExtractor): Attributes ---------- TITLE_PATTERN : re.Pattern - Regular expression pattern to identify titles in the text. 
+ Regular expression pattern to identify titles in the text. TITLE_PATTERN_MULTILINE : re.Pattern Regular expression pattern to identify titles in the text with multiline support. """ @@ -104,13 +104,15 @@ class PDFExtractor(InformationFileExtractor): page_content=content, ) - async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: + async def aextract_content(self, file_path: Path, name: str) -> list[InternalInformationPiece]: """Extract content from given file. Parameters ---------- file_path : Path Path to the file the information should be extracted from. + name : str + Name of the document. Returns ------- @@ -136,7 +138,7 @@ async def aextract_content(self, file_path: Path) -> list[InternalInformationPie page=page, temp_dir=temp_dir, title=current_title, - document_name=file_path.name, + document_name=name, ) pdf_elements += new_pdf_elements diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py index 2a9d21c..e7523b6 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py @@ -45,7 +45,7 @@ def compatible_file_types(self) -> list[FileType]: """ return [FileType.XML] - async def aextract_content(self, file_path: Path) -> list[InternalInformationPiece]: + async def aextract_content(self, file_path: Path, name: str) -> list[InternalInformationPiece]: """ Extract content from an XML file and processes the elements. @@ -53,6 +53,8 @@ async def aextract_content(self, file_path: Path) -> list[InternalInformationPie ---------- file_path : Path The path to the XML file to be processed. + name : str + Name of the document.
Returns ------- diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py index dfb7031..04abb2c 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py @@ -40,7 +40,7 @@ def __init__(self, file_service: FileService, available_extractors: list[Informa available_extractors : list of InformationExtractor A list of available information extractors to be used by the GeneralExtractor. """ - self._file_service=file_service + self._file_service = file_service self._available_extractors = available_extractors @property @@ -84,7 +84,7 @@ async def aextract_content( ] if not correct_extractors: raise ValueError(f"No extractor found for file-ending {file_type}") - return await correct_extractors[-1].aextract_content(temp_file_path) + return await correct_extractors[-1].aextract_content(temp_file_path, name) except Exception as e: logger.error("Error during document parsing: %s %s", e, traceback.format_exc()) raise e diff --git a/rag-core-api/src/rag_core_api/apis/rag_api.py b/rag-core-api/src/rag_core_api/apis/rag_api.py index 64597dd..fb432c6 100644 --- a/rag-core-api/src/rag_core_api/apis/rag_api.py +++ b/rag-core-api/src/rag_core_api/apis/rag_api.py @@ -3,6 +3,10 @@ # coding: utf-8 # flake8: noqa: D105 +from asyncio import FIRST_COMPLETED, CancelledError, create_task, wait +from contextlib import suppress +import logging +from time import sleep from typing import Dict, List # noqa: F401 import importlib import pkgutil @@ -32,6 +36,7 @@ from rag_core_api.models.extra_models import TokenModel # noqa: F401 from pydantic import Field, StrictStr from typing import Any, List +import logging from typing_extensions import Annotated from rag_core_api.models.chat_request import ChatRequest from rag_core_api.models.chat_response import 
ChatResponse From 2e591c3900c457497d270f01a3add5120e0e1536 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Thu, 15 May 2025 13:16:20 +0200 Subject: [PATCH 06/56] wip --- .../src/admin_api_lib/apis/admin_api.py | 55 +++++-- .../src/admin_api_lib/apis/admin_api_base.py | 18 +-- .../api_endpoints/default_source_uploader.py | 2 +- .../admin_api_lib/models/document_status.py | 6 +- .../models/http_validation_error.py | 101 ++++++++++++ .../admin_api_lib/models/key_value_pair.py | 24 +-- .../src/admin_api_lib/models/status.py | 4 +- .../src/admin_api_lib/models/upload_source.py | 8 +- .../admin_api_lib/models/validation_error.py | 105 ++++++++++++ .../models/validation_error_loc_inner.py | 153 ++++++++++++++++++ 10 files changed, 429 insertions(+), 47 deletions(-) create mode 100644 admin-api-lib/src/admin_api_lib/models/http_validation_error.py create mode 100644 admin-api-lib/src/admin_api_lib/models/validation_error.py create mode 100644 admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 9d32286..15f8438 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -1,6 +1,6 @@ # coding: utf-8 -from typing import Dict, List, Annotated # noqa: F401 +from typing import Dict, List # noqa: F401 import importlib import pkgutil @@ -26,9 +26,10 @@ from admin_api_lib.models.extra_models import TokenModel # noqa: F401 from pydantic import Field, StrictBytes, StrictStr -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Tuple, Union from typing_extensions import Annotated from admin_api_lib.models.document_status import DocumentStatus +from admin_api_lib.models.http_validation_error import HTTPValidationError from admin_api_lib.models.key_value_pair import KeyValuePair @@ -44,12 +45,14 @@ responses={ 200: {"description": "Deleted"}, 500: 
{"description": "Internal server error"}, + 422: {"model": HTTPValidationError, "description": "Validation Error"}, }, tags=["admin"], + summary="Delete Document", response_model_by_alias=True, ) async def delete_document( - identification: str = Path(..., description=""), + identification: StrictStr = Path(..., description=""), ) -> None: """ Asynchronously deletes a document based on the provided identification. @@ -75,12 +78,16 @@ async def delete_document( 400: {"model": str, "description": "Bad request"}, 404: {"model": str, "description": "Document not found."}, 500: {"model": str, "description": "Internal server error"}, + 422: {"model": HTTPValidationError, "description": "Validation Error"}, }, tags=["admin"], + summary="Document Reference Id Get", response_model_by_alias=True, ) -async def document_reference_id_get( - identification: str = Path(..., description="Identifier of the pdf document."), +async def document_reference( + identification: Annotated[StrictStr, Field(description="Identifier of the document.")] = Path( + ..., description="Identifier of the document." + ), ) -> Response: """ Asynchronously retrieve a document reference by its identification. 
@@ -97,7 +104,7 @@ async def document_reference_id_get( """ if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().document_reference_id_get(identification) + return await BaseAdminApi.subclasses[0]().document_reference(identification) @router.get( @@ -107,6 +114,7 @@ async def document_reference_id_get( 500: {"description": "Internal server error"}, }, tags=["admin"], + summary="Get All Documents Status", response_model_by_alias=True, ) async def get_all_documents_status() -> List[DocumentStatus]: @@ -123,25 +131,48 @@ async def get_all_documents_status() -> List[DocumentStatus]: return await BaseAdminApi.subclasses[0]().get_all_documents_status() +@router.post( + "/upload_file", + responses={ + 200: {"model": object, "description": "ok"}, + 400: {"description": "Bad request"}, + 422: {"description": "Unprocessable Content"}, + 500: {"description": "Internal server error"}, + }, + tags=["admin"], + summary="Upload File", + response_model_by_alias=True, +) +async def upload_file( + file: UploadFile, + request: Request, +) -> object: + """Uploads user selected sources.""" + if not BaseAdminApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") + return await BaseAdminApi.subclasses[0]().upload_file(file) + + @router.post( "/upload_source", responses={ 200: {"description": "ok"}, 400: {"description": "Bad request"}, - 422: {"description": "If no text has been extracted from the file."}, + 422: {"description": "Unprocessable Content"}, 500: {"description": "Internal server error"}, }, tags=["admin"], + summary="Upload Source", response_model_by_alias=True, ) async def upload_source( + request: Request, - type: Annotated[str, Form()], - name: Annotated[str, Form()], - file: Optional[UploadFile] = None, - kwargs: Optional[Annotated[List[KeyValuePair], Form()]] = None, + type: StrictStr = Query(None, description="", alias="type"), + name: StrictStr = Query(None, 
description="", alias="name"), + key_value_pair: List[KeyValuePair] = Body(None, description=""), ) -> None: """Uploads user selected sources.""" if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().upload_source(type, name, file, kwargs, request) + return await BaseAdminApi.subclasses[0]().upload_source(type, name, key_value_pair) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index 8aebb8b..8835113 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -3,11 +3,12 @@ from typing import ClassVar, Dict, List, Tuple # noqa: F401 from pydantic import Field, StrictBytes, StrictStr -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Tuple, Union from typing_extensions import Annotated -from fastapi import Request, Response, UploadFile from admin_api_lib.models.document_status import DocumentStatus +from admin_api_lib.models.http_validation_error import HTTPValidationError from admin_api_lib.models.key_value_pair import KeyValuePair +from fastapi import Request, Response, UploadFile class BaseAdminApi: @@ -20,7 +21,7 @@ def __init_subclass__(cls, **kwargs): async def delete_document( self, identification: StrictStr, - ) -> None: + ) -> None: """ Asynchronously deletes a document based on the provided identification. @@ -34,9 +35,9 @@ async def delete_document( None """ - async def document_reference_id_get( + async def document_reference( self, - identification: Annotated[StrictStr, Field(description="Identifier of the pdf document.")], + identification: Annotated[StrictStr, Field(description="Identifier of the document.")], ) -> Response: """ Asynchronously retrieve a document reference by its identification. 
@@ -68,9 +69,8 @@ async def upload_source( self, type: StrictStr, name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[List[KeyValuePair]], + key_value_pair: List[KeyValuePair], request: Request, ) -> None: - """Uploads user selected sources.""" - ... + """Uploads user selected source.""" + diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index d520293..196d8a7 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -92,7 +92,7 @@ async def _handle_source_upload( kwargs: Optional[list[KeyValuePair]], ): try: - if file: + if file: information_pieces = self._extractor_api.extract(type, source_name, str(file), kwargs) else: information_pieces = self._extractor_api.extract(type, source_name, None, kwargs) diff --git a/admin-api-lib/src/admin_api_lib/models/document_status.py b/admin-api-lib/src/admin_api_lib/models/document_status.py index fedce07..ff2f94a 100644 --- a/admin-api-lib/src/admin_api_lib/models/document_status.py +++ b/admin-api-lib/src/admin_api_lib/models/document_status.py @@ -3,7 +3,7 @@ """ admin-api-lib -The API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. 
The version of the OpenAPI document: 1.0.0 Generated by OpenAPI Generator (https://openapi-generator.tech) @@ -29,7 +29,9 @@ class DocumentStatus(BaseModel): - """ """ # noqa: E501 + """ + DocumentStatus + """ # noqa: E501 name: StrictStr status: Status diff --git a/admin-api-lib/src/admin_api_lib/models/http_validation_error.py b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py new file mode 100644 index 0000000..40f6013 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py @@ -0,0 +1,101 @@ +# coding: utf-8 + +""" +admin-api-lib + +The API is used for the communication between the admin frontend and the admin backend in the rag project. + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. +""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + + +from pydantic import BaseModel, ConfigDict +from typing import Any, ClassVar, Dict, List, Optional +from admin_api_lib.models.validation_error import ValidationError + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + + +class HTTPValidationError(BaseModel): + """ + HTTPValidationError + """ # noqa: E501 + + detail: Optional[List[ValidationError]] = None + __properties: ClassVar[List[str]] = ["detail"] + + model_config = { + "populate_by_name": True, + "validate_assignment": True, + "protected_namespaces": (), + } + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Create an instance of HTTPValidationError from 
a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. + """ + _dict = self.model_dump( + by_alias=True, + exclude={}, + exclude_none=True, + ) + # override the default output from pydantic by calling `to_dict()` of each item in detail (list) + _items = [] + if self.detail: + for _item in self.detail: + if _item: + _items.append(_item.to_dict()) + _dict["detail"] = _items + return _dict + + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of HTTPValidationError from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate( + { + "detail": ( + [ValidationError.from_dict(_item) for _item in obj.get("detail")] + if obj.get("detail") is not None + else None + ) + } + ) + return _obj diff --git a/admin-api-lib/src/admin_api_lib/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py index 2d2fe5e..82c0c37 100644 --- a/admin-api-lib/src/admin_api_lib/models/key_value_pair.py +++ b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py @@ -3,7 +3,7 @@ """ admin-api-lib -The API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. 
The version of the OpenAPI document: 1.0.0 Generated by OpenAPI Generator (https://openapi-generator.tech) @@ -18,8 +18,8 @@ import json -from pydantic import BaseModel, ConfigDict -from typing import Any, ClassVar, Dict, List, Optional +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List try: from typing import Self @@ -28,10 +28,12 @@ class KeyValuePair(BaseModel): - """ """ # noqa: E501 + """ + KeyValuePair + """ # noqa: E501 - key: Optional[Any] = None - value: Optional[Any] = None + key: StrictStr + value: StrictStr __properties: ClassVar[List[str]] = ["key", "value"] model_config = { @@ -69,16 +71,6 @@ def to_dict(self) -> Dict[str, Any]: exclude={}, exclude_none=True, ) - # set to None if key (nullable) is None - # and model_fields_set contains the field - if self.key is None and "key" in self.model_fields_set: - _dict["key"] = None - - # set to None if value (nullable) is None - # and model_fields_set contains the field - if self.value is None and "value" in self.model_fields_set: - _dict["value"] = None - return _dict @classmethod diff --git a/admin-api-lib/src/admin_api_lib/models/status.py b/admin-api-lib/src/admin_api_lib/models/status.py index 5c7836f..e4ac64b 100644 --- a/admin-api-lib/src/admin_api_lib/models/status.py +++ b/admin-api-lib/src/admin_api_lib/models/status.py @@ -3,7 +3,7 @@ """ admin-api-lib -The API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. 
The version of the OpenAPI document: 1.0.0 Generated by OpenAPI Generator (https://openapi-generator.tech) @@ -26,8 +26,6 @@ class Status(str, Enum): - """ """ - """ allowed enum values """ diff --git a/admin-api-lib/src/admin_api_lib/models/upload_source.py b/admin-api-lib/src/admin_api_lib/models/upload_source.py index e90690f..1d86e38 100644 --- a/admin-api-lib/src/admin_api_lib/models/upload_source.py +++ b/admin-api-lib/src/admin_api_lib/models/upload_source.py @@ -17,7 +17,7 @@ import re # noqa: F401 import json - +from fastapi import UploadFile from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union from admin_api_lib.models.key_value_pair import KeyValuePair @@ -31,10 +31,10 @@ class UploadSource(BaseModel): """ """ # noqa: E501 - file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None type: StrictStr + name: StrictStr kwargs: Optional[List[KeyValuePair]] = None - __properties: ClassVar[List[str]] = ["file", "type", "kwargs"] + __properties: ClassVar[List[str]] = ["type", "name", "kwargs"] model_config = { "populate_by_name": True, @@ -91,7 +91,7 @@ def from_dict(cls, obj: Dict) -> Self: _obj = cls.model_validate( { - "file": obj.get("file"), + "name": obj.get("name"), "type": obj.get("type"), "kwargs": ( [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] diff --git a/admin-api-lib/src/admin_api_lib/models/validation_error.py b/admin-api-lib/src/admin_api_lib/models/validation_error.py new file mode 100644 index 0000000..f922b21 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/models/validation_error.py @@ -0,0 +1,105 @@ +# coding: utf-8 + +""" +admin-api-lib + +The API is used for the communication between the admin frontend and the admin backend in the rag project. + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + + +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List +from admin_api_lib.models.validation_error_loc_inner import ValidationErrorLocInner + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + + +class ValidationError(BaseModel): + """ + ValidationError + """ # noqa: E501 + + loc: List[ValidationErrorLocInner] + msg: StrictStr + type: StrictStr + __properties: ClassVar[List[str]] = ["loc", "msg", "type"] + + model_config = { + "populate_by_name": True, + "validate_assignment": True, + "protected_namespaces": (), + } + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Create an instance of ValidationError from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. 
+ """ + _dict = self.model_dump( + by_alias=True, + exclude={}, + exclude_none=True, + ) + # override the default output from pydantic by calling `to_dict()` of each item in loc (list) + _items = [] + if self.loc: + for _item in self.loc: + if _item: + _items.append(_item.to_dict()) + _dict["loc"] = _items + return _dict + + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of ValidationError from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate( + { + "loc": ( + [ValidationErrorLocInner.from_dict(_item) for _item in obj.get("loc")] + if obj.get("loc") is not None + else None + ), + "msg": obj.get("msg"), + "type": obj.get("type"), + } + ) + return _obj diff --git a/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py b/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py new file mode 100644 index 0000000..8cd53fe --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py @@ -0,0 +1,153 @@ +# coding: utf-8 + +""" +admin-api-lib + +The API is used for the communication between the admin frontend and the admin backend in the rag project. + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +from __future__ import annotations +from inspect import getfullargspec +import json +import pprint +import re # noqa: F401 + + +from pydantic import BaseModel, ConfigDict, Field, StrictInt, StrictStr, ValidationError, field_validator +from typing import Optional +from typing import Union, Any, List, TYPE_CHECKING, Optional, Dict +from typing_extensions import Literal +from pydantic import StrictStr, Field + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + +VALIDATIONERRORLOCINNER_ANY_OF_SCHEMAS = ["int", "str"] + + +class ValidationErrorLocInner(BaseModel): + """ + ValidationErrorLocInner + """ + + # data type: str + anyof_schema_1_validator: Optional[StrictStr] = None + # data type: int + anyof_schema_2_validator: Optional[StrictInt] = None + if TYPE_CHECKING: + actual_instance: Optional[Union[int, str]] = None + else: + actual_instance: Any = None + any_of_schemas: List[str] = Literal[VALIDATIONERRORLOCINNER_ANY_OF_SCHEMAS] + + model_config = { + "validate_assignment": True, + "protected_namespaces": (), + } + + def __init__(self, *args, **kwargs) -> None: + if args: + if len(args) > 1: + raise ValueError("If a position argument is used, only 1 is allowed to set `actual_instance`") + if kwargs: + raise ValueError("If a position argument is used, keyword arguments cannot be used.") + super().__init__(actual_instance=args[0]) + else: + super().__init__(**kwargs) + + @field_validator("actual_instance") + def actual_instance_must_validate_anyof(cls, v): + instance = ValidationErrorLocInner.model_construct() + error_messages = [] + # validate data type: str + try: + instance.anyof_schema_1_validator = v + return v + except (ValidationError, ValueError) as e: + error_messages.append(str(e)) + # validate data type: int + try: + instance.anyof_schema_2_validator = v + return v + except (ValidationError, ValueError) as e: + error_messages.append(str(e)) + if error_messages: + # no match + raise ValueError( 
+ "No match found when setting the actual_instance in ValidationErrorLocInner with anyOf schemas: int, str. Details: " + + ", ".join(error_messages) + ) + else: + return v + + @classmethod + def from_dict(cls, obj: dict) -> Self: + return cls.from_json(json.dumps(obj)) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Returns the object represented by the json string""" + instance = cls.model_construct() + error_messages = [] + # deserialize data into str + try: + # validation + instance.anyof_schema_1_validator = json.loads(json_str) + # assign value to actual_instance + instance.actual_instance = instance.anyof_schema_1_validator + return instance + except (ValidationError, ValueError) as e: + error_messages.append(str(e)) + # deserialize data into int + try: + # validation + instance.anyof_schema_2_validator = json.loads(json_str) + # assign value to actual_instance + instance.actual_instance = instance.anyof_schema_2_validator + return instance + except (ValidationError, ValueError) as e: + error_messages.append(str(e)) + + if error_messages: + # no match + raise ValueError( + "No match found when deserializing the JSON string into ValidationErrorLocInner with anyOf schemas: int, str. 
Details: " + + ", ".join(error_messages) + ) + else: + return instance + + def to_json(self) -> str: + """Returns the JSON representation of the actual instance""" + if self.actual_instance is None: + return "null" + + to_json = getattr(self.actual_instance, "to_json", None) + if callable(to_json): + return self.actual_instance.to_json() + else: + return json.dumps(self.actual_instance) + + def to_dict(self) -> Dict: + """Returns the dict representation of the actual instance""" + if self.actual_instance is None: + return "null" + + to_json = getattr(self.actual_instance, "to_json", None) + if callable(to_json): + return self.actual_instance.to_dict() + else: + # primitive type + return self.actual_instance + + def to_str(self) -> str: + """Returns the string representation of the actual instance""" + return pprint.pformat(self.model_dump()) From cf8b892d4010d075d818e5ead78854585dc06085 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Thu, 15 May 2025 13:22:53 +0200 Subject: [PATCH 07/56] wip --- admin-api-lib/openapi.yaml | 597 ++++++++++++++++++++++++++----------- 1 file changed, 428 insertions(+), 169 deletions(-) diff --git a/admin-api-lib/openapi.yaml b/admin-api-lib/openapi.yaml index efbb2f6..0c1d883 100644 --- a/admin-api-lib/openapi.yaml +++ b/admin-api-lib/openapi.yaml @@ -1,169 +1,428 @@ -openapi: 3.0.2 -info: - description: The API is used for the communication between the admin frontend and - the admin backend in the rag project. - title: admin-api-lib - version: 1.0.0 -servers: -- url: / -paths: - /document_reference/{identification}: - get: - operationId: document_reference_id_get - parameters: - - description: Identifier of the pdf document. - explode: false - in: path - name: identification - required: true - schema: - type: string - style: simple - responses: - "200": - content: - application/pdf: - schema: - format: binary - type: string - description: Returns the pdf in binary form. 
- "400": - content: - application/json: - schema: - type: string - description: Bad request - "404": - content: - application/json: - schema: - type: string - description: Document not found. - "500": - content: - application/json: - schema: - type: string - description: Internal server error - tags: - - admin - /delete_document/{identification}: - delete: - operationId: delete_document - parameters: - - explode: false - in: path - name: identification - required: true - schema: - type: string - style: simple - responses: - "200": - description: Deleted - "500": - description: Internal server error - tags: - - admin - /all_documents_status: - get: - operationId: get_all_documents_status - responses: - "200": - content: - application/json: - schema: - items: - $ref: '#/components/schemas/document_status' - type: array - description: List of document links - "500": - description: Internal server error - tags: - - admin - /upload_source: - post: - description: Uploads user selected sources. - operationId: upload_source - requestBody: - content: - multipart/form-data: - schema: - $ref: '#/components/schemas/upload_source' - description: The source to upload. - required: true - responses: - "200": - description: ok - "400": - description: Bad request - "422": - description: If no text has been extracted from the file. 
- "500": - description: Internal server error - tags: - - admin -components: - schemas: - status: - description: "" - enum: - - UPLOADING - - PROCESSING - - READY - - ERROR - title: status - type: string - document_status: - description: "" - example: - name: name - status: UPLOADING - properties: - name: - description: "" - title: name - type: string - status: - $ref: '#/components/schemas/status' - required: - - name - - status - title: document_status - type: object - upload_source: - description: "" - properties: - file: - description: "" - format: binary - type: string - type: - description: "" - type: string - kwargs: - description: "" - items: - $ref: '#/components/schemas/key_value_pair' - type: array - name: - description: "" - type: string - required: - - name - - type - type: object - key_value_pair: - description: "" - example: - value: value - key: key - properties: - key: - description: "" - title: Key - value: - description: "" - title: Value - title: MetaInformationPiece - type: object +{ + "openapi": "3.1.0", + "info": { + "title": "admin-api-lib", + "description": "The API is used for the communication between the admin frontend and the admin backend in the rag project.", + "version": "1.0.0" + }, + "servers": [ + { + "url": "/api" + } + ], + "paths": { + "/delete_document/{identification}": { + "delete": { + "tags": [ + "admin" + ], + "summary": "Delete Document", + "description": "Asynchronously deletes a document based on the provided identification.\n\nParameters\n----------\nidentification : str\n The unique identifier of the document to be deleted.\n\nReturns\n-------\nNone", + "operationId": "delete_document", + "parameters": [ + { + "name": "identification", + "in": "path", + "required": true, + "schema": { + "type": "string", + "description": "", + "title": "Identification" + } + } + ], + "responses": { + "200": { + "description": "Deleted", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": 
"Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + }, + "500": { + "description": "Internal server error" + } + } + } + }, + "/document_reference/{identification}": { + "get": { + "tags": [ + "admin" + ], + "summary": "Document Reference Id Get", + "description": "Asynchronously retrieve a document reference by its identification.\n\nParameters\n----------\nidentification : str\n The unique identifier for the document reference.\n\nReturns\n-------\nResponse\n The response object containing the document reference details.", + "operationId": "document_reference", + "parameters": [ + { + "name": "identification", + "in": "path", + "required": true, + "schema": { + "type": "string", + "description": "Identifier of the document.", + "title": "Identification" + }, + "description": "Identifier of the document." + } + ], + "responses": { + "200": { + "description": "Returns the pdf in binary form.", + "content": { + "application/json": { + "schema": { + "type": "string", + "format": "binary", + "title": "Response 200 Document Reference Document Reference Identification Get" + } + } + } + }, + "400": { + "description": "Bad request", + "content": { + "application/json": { + "schema": { + "type": "string", + "title": "Response 400 Document Reference Document Reference Identification Get" + } + } + } + }, + "404": { + "description": "Document not found.", + "content": { + "application/json": { + "schema": { + "type": "string", + "title": "Response 404 Document Reference Document Reference Identification Get" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + }, + "500": { + "description": "Internal server error", + "content": { + "application/json": { + "schema": { + "type": "string", + "title": "Response 500 Document Reference Document Reference 
Identification Get" + } + } + } + } + } + } + }, + "/all_documents_status": { + "get": { + "tags": [ + "admin" + ], + "summary": "Get All Documents Status", + "description": "Asynchronously retrieves the status of all documents.\n\nReturns\n-------\nlist[DocumentStatus]\n A list containing the status of all documents.", + "operationId": "get_all_documents_status", + "responses": { + "200": { + "description": "List of document links", + "content": { + "application/json": { + "schema": { + "items": { + "$ref": "#/components/schemas/DocumentStatus" + }, + "type": "array", + "title": "Response 200 Get All Documents Status All Documents Status Get" + } + } + } + }, + "500": { + "description": "Internal server error" + } + } + } + }, + "/upload_file": { + "post": { + "tags": [ + "admin" + ], + "summary": "Upload File", + "description": "Uploads user selected sources.", + "operationId": "upload_file", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_upload_file_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "ok", + "content": { + "application/json": { + "schema": { + "title": "Response 200 Upload File Upload File Post" + } + } + } + }, + "400": { + "description": "Bad request" + }, + "422": { + "description": "Unprocessable Content" + }, + "500": { + "description": "Internal server error" + } + } + } + }, + "/upload_source": { + "post": { + "tags": [ + "admin" + ], + "summary": "Upload Source", + "description": "Uploads user selected sources.", + "operationId": "upload_source", + "parameters": [ + { + "name": "type", + "in": "query", + "required": false, + "schema": { + "type": "string", + "description": "", + "title": "Type" + } + }, + { + "name": "name", + "in": "query", + "required": false, + "schema": { + "type": "string", + "description": "", + "title": "Name" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": 
"array", + "items": { + "$ref": "#/components/schemas/KeyValuePair" + }, + "description": "", + "title": "Key Value Pair" + } + } + } + }, + "responses": { + "200": { + "description": "ok", + "content": { + "application/json": { + "schema": {} + } + } + }, + "400": { + "description": "Bad request" + }, + "422": { + "description": "Unprocessable Content" + }, + "500": { + "description": "Internal server error" + } + } + } + } + }, + "components": { + "schemas": { + "Body_upload_file_upload_file_post": { + "properties": { + "file": { + "type": "string", + "format": "binary", + "title": "File" + } + }, + "type": "object", + "required": [ + "file" + ], + "title": "Body_upload_file_upload_file_post" + }, + "DocumentStatus": { + "properties": { + "name": { + "type": "string", + "title": "Name" + }, + "status": { + "$ref": "#/components/schemas/Status" + } + }, + "type": "object", + "required": [ + "name", + "status" + ], + "title": "DocumentStatus", + "description": "DocumentStatus" + }, + "HTTPValidationError": { + "properties": { + "detail": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/ValidationError" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Detail" + } + }, + "type": "object", + "title": "HTTPValidationError", + "description": "HTTPValidationError" + }, + "KeyValuePair": { + "properties": { + "key": { + "type": "string", + "title": "Key" + }, + "value": { + "type": "string", + "title": "Value" + } + }, + "type": "object", + "required": [ + "key", + "value" + ], + "title": "KeyValuePair", + "description": "KeyValuePair" + }, + "Status": { + "type": "string", + "enum": [ + "UPLOADING", + "PROCESSING", + "READY", + "ERROR" + ], + "title": "Status", + "description": "allowed enum values" + }, + "ValidationError": { + "properties": { + "loc": { + "items": { + "$ref": "#/components/schemas/ValidationErrorLocInner" + }, + "type": "array", + "title": "Loc" + }, + "msg": { + "type": "string", + "title": "Msg" + }, + "type": { + 
"type": "string", + "title": "Type" + } + }, + "type": "object", + "required": [ + "loc", + "msg", + "type" + ], + "title": "ValidationError", + "description": "ValidationError" + }, + "ValidationErrorLocInner": { + "properties": { + "anyof_schema_1_validator": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Anyof Schema 1 Validator" + }, + "anyof_schema_2_validator": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Anyof Schema 2 Validator" + }, + "actual_instance": { + "title": "Actual Instance" + }, + "any_of_schemas": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Any Of Schemas" + } + }, + "type": "object", + "title": "ValidationErrorLocInner", + "description": "ValidationErrorLocInner" + } + } + } +} \ No newline at end of file From 8ee912cf0d9c4d3851e5e0bec478259460167954 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Thu, 15 May 2025 13:26:23 +0200 Subject: [PATCH 08/56] wip --- admin-api-lib/openapi.yaml | 792 ++++++++---------- .../src/admin_api_lib/apis/admin_api_base.py | 7 + .../models/http_validation_error.py | 5 + .../src/admin_api_lib/models/status.py | 4 + .../src/admin_api_lib/models/upload_source.py | 103 --- .../models/validation_error_loc_inner.py | 182 ++-- 6 files changed, 452 insertions(+), 641 deletions(-) delete mode 100644 admin-api-lib/src/admin_api_lib/models/upload_source.py diff --git a/admin-api-lib/openapi.yaml b/admin-api-lib/openapi.yaml index 0c1d883..86d433a 100644 --- a/admin-api-lib/openapi.yaml +++ b/admin-api-lib/openapi.yaml @@ -1,428 +1,364 @@ -{ - "openapi": "3.1.0", - "info": { - "title": "admin-api-lib", - "description": "The API is used for the communication between the admin frontend and the admin backend in the rag project.", - "version": "1.0.0" - }, - "servers": [ - { - "url": "/api" - } - ], - "paths": { - "/delete_document/{identification}": { - "delete": { - "tags": [ - "admin" - ], - "summary": "Delete Document", 
- "description": "Asynchronously deletes a document based on the provided identification.\n\nParameters\n----------\nidentification : str\n The unique identifier of the document to be deleted.\n\nReturns\n-------\nNone", - "operationId": "delete_document", - "parameters": [ - { - "name": "identification", - "in": "path", - "required": true, - "schema": { - "type": "string", - "description": "", - "title": "Identification" - } - } - ], - "responses": { - "200": { - "description": "Deleted", - "content": { - "application/json": { - "schema": {} - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - }, - "500": { - "description": "Internal server error" - } - } - } - }, - "/document_reference/{identification}": { - "get": { - "tags": [ - "admin" - ], - "summary": "Document Reference Id Get", - "description": "Asynchronously retrieve a document reference by its identification.\n\nParameters\n----------\nidentification : str\n The unique identifier for the document reference.\n\nReturns\n-------\nResponse\n The response object containing the document reference details.", - "operationId": "document_reference", - "parameters": [ - { - "name": "identification", - "in": "path", - "required": true, - "schema": { - "type": "string", - "description": "Identifier of the document.", - "title": "Identification" - }, - "description": "Identifier of the document." 
- } - ], - "responses": { - "200": { - "description": "Returns the pdf in binary form.", - "content": { - "application/json": { - "schema": { - "type": "string", - "format": "binary", - "title": "Response 200 Document Reference Document Reference Identification Get" - } - } - } - }, - "400": { - "description": "Bad request", - "content": { - "application/json": { - "schema": { - "type": "string", - "title": "Response 400 Document Reference Document Reference Identification Get" - } - } - } - }, - "404": { - "description": "Document not found.", - "content": { - "application/json": { - "schema": { - "type": "string", - "title": "Response 404 Document Reference Document Reference Identification Get" - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - }, - "500": { - "description": "Internal server error", - "content": { - "application/json": { - "schema": { - "type": "string", - "title": "Response 500 Document Reference Document Reference Identification Get" - } - } - } - } - } - } - }, - "/all_documents_status": { - "get": { - "tags": [ - "admin" - ], - "summary": "Get All Documents Status", - "description": "Asynchronously retrieves the status of all documents.\n\nReturns\n-------\nlist[DocumentStatus]\n A list containing the status of all documents.", - "operationId": "get_all_documents_status", - "responses": { - "200": { - "description": "List of document links", - "content": { - "application/json": { - "schema": { - "items": { - "$ref": "#/components/schemas/DocumentStatus" - }, - "type": "array", - "title": "Response 200 Get All Documents Status All Documents Status Get" - } - } - } - }, - "500": { - "description": "Internal server error" - } - } - } - }, - "/upload_file": { - "post": { - "tags": [ - "admin" - ], - "summary": "Upload File", - "description": "Uploads user selected sources.", - "operationId": "upload_file", - 
"requestBody": { - "content": { - "multipart/form-data": { - "schema": { - "$ref": "#/components/schemas/Body_upload_file_upload_file_post" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "ok", - "content": { - "application/json": { - "schema": { - "title": "Response 200 Upload File Upload File Post" - } - } - } - }, - "400": { - "description": "Bad request" - }, - "422": { - "description": "Unprocessable Content" - }, - "500": { - "description": "Internal server error" - } - } - } - }, - "/upload_source": { - "post": { - "tags": [ - "admin" - ], - "summary": "Upload Source", - "description": "Uploads user selected sources.", - "operationId": "upload_source", - "parameters": [ - { - "name": "type", - "in": "query", - "required": false, - "schema": { - "type": "string", - "description": "", - "title": "Type" - } - }, - { - "name": "name", - "in": "query", - "required": false, - "schema": { - "type": "string", - "description": "", - "title": "Name" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/KeyValuePair" - }, - "description": "", - "title": "Key Value Pair" - } - } - } - }, - "responses": { - "200": { - "description": "ok", - "content": { - "application/json": { - "schema": {} - } - } - }, - "400": { - "description": "Bad request" - }, - "422": { - "description": "Unprocessable Content" - }, - "500": { - "description": "Internal server error" - } - } - } - } - }, - "components": { - "schemas": { - "Body_upload_file_upload_file_post": { - "properties": { - "file": { - "type": "string", - "format": "binary", - "title": "File" - } - }, - "type": "object", - "required": [ - "file" - ], - "title": "Body_upload_file_upload_file_post" - }, - "DocumentStatus": { - "properties": { - "name": { - "type": "string", - "title": "Name" - }, - "status": { - "$ref": "#/components/schemas/Status" - } - }, - "type": "object", - "required": [ - 
"name", - "status" - ], - "title": "DocumentStatus", - "description": "DocumentStatus" - }, - "HTTPValidationError": { - "properties": { - "detail": { - "anyOf": [ - { - "items": { - "$ref": "#/components/schemas/ValidationError" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "title": "Detail" - } - }, - "type": "object", - "title": "HTTPValidationError", - "description": "HTTPValidationError" - }, - "KeyValuePair": { - "properties": { - "key": { - "type": "string", - "title": "Key" - }, - "value": { - "type": "string", - "title": "Value" - } - }, - "type": "object", - "required": [ - "key", - "value" - ], - "title": "KeyValuePair", - "description": "KeyValuePair" - }, - "Status": { - "type": "string", - "enum": [ - "UPLOADING", - "PROCESSING", - "READY", - "ERROR" - ], - "title": "Status", - "description": "allowed enum values" - }, - "ValidationError": { - "properties": { - "loc": { - "items": { - "$ref": "#/components/schemas/ValidationErrorLocInner" - }, - "type": "array", - "title": "Loc" - }, - "msg": { - "type": "string", - "title": "Msg" - }, - "type": { - "type": "string", - "title": "Type" - } - }, - "type": "object", - "required": [ - "loc", - "msg", - "type" - ], - "title": "ValidationError", - "description": "ValidationError" - }, - "ValidationErrorLocInner": { - "properties": { - "anyof_schema_1_validator": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Anyof Schema 1 Validator" - }, - "anyof_schema_2_validator": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ], - "title": "Anyof Schema 2 Validator" - }, - "actual_instance": { - "title": "Actual Instance" - }, - "any_of_schemas": { - "items": { - "type": "string" - }, - "type": "array", - "title": "Any Of Schemas" - } - }, - "type": "object", - "title": "ValidationErrorLocInner", - "description": "ValidationErrorLocInner" - } - } - } -} \ No newline at end of file +openapi: 3.1.0 +info: + description: The API is used for the 
communication between the admin frontend + and the admin backend in the rag project. + title: admin-api-lib + version: 1.0.0 +servers: +- url: /api +paths: + /delete_document/{identification}: + delete: + description: |- + Asynchronously deletes a document based on the provided identification. + + Parameters + ---------- + identification : str + The unique identifier of the document to be deleted. + + Returns + ------- + None + operationId: delete_document + parameters: + - explode: false + in: path + name: identification + required: true + schema: + description: "" + title: Identification + type: string + style: simple + responses: + "200": + content: + application/json: + schema: {} + description: Deleted + "422": + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + "500": + description: Internal server error + summary: Delete Document + tags: + - admin + /document_reference/{identification}: + get: + description: |- + Asynchronously retrieve a document reference by its identification. + + Parameters + ---------- + identification : str + The unique identifier for the document reference. + + Returns + ------- + Response + The response object containing the document reference details. + operationId: document_reference + parameters: + - description: Identifier of the document. + explode: false + in: path + name: identification + required: true + schema: + description: Identifier of the document. + title: Identification + type: string + style: simple + responses: + "200": + content: + application/json: + schema: + format: binary + title: Response 200 Document Reference Document Reference Identification Get + type: string + description: Returns the pdf in binary form. 
+ "400": + content: + application/json: + schema: + title: Response 400 Document Reference Document Reference Identification Get + type: string + description: Bad request + "404": + content: + application/json: + schema: + title: Response 404 Document Reference Document Reference Identification Get + type: string + description: Document not found. + "422": + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + "500": + content: + application/json: + schema: + title: Response 500 Document Reference Document Reference Identification Get + type: string + description: Internal server error + summary: Document Reference Id Get + tags: + - admin + /all_documents_status: + get: + description: |- + Asynchronously retrieves the status of all documents. + + Returns + ------- + list[DocumentStatus] + A list containing the status of all documents. + operationId: get_all_documents_status + responses: + "200": + content: + application/json: + schema: + items: + $ref: '#/components/schemas/DocumentStatus' + type: array + description: List of document links + "500": + description: Internal server error + summary: Get All Documents Status + tags: + - admin + /upload_file: + post: + description: Uploads user selected sources. + operationId: upload_file + requestBody: + content: + multipart/form-data: + schema: + $ref: '#/components/schemas/Body_upload_file_upload_file_post' + required: true + responses: + "200": + content: + application/json: + schema: {} + description: ok + "400": + description: Bad request + "422": + description: Unprocessable Content + "500": + description: Internal server error + summary: Upload File + tags: + - admin + /upload_source: + post: + description: Uploads user selected sources. 
+ operationId: upload_source + parameters: + - explode: true + in: query + name: type + required: false + schema: + description: "" + title: Type + type: string + style: form + - explode: true + in: query + name: name + required: false + schema: + description: "" + title: Name + type: string + style: form + requestBody: + content: + application/json: + schema: + description: "" + items: + $ref: '#/components/schemas/KeyValuePair' + type: array + responses: + "200": + content: + application/json: + schema: {} + description: ok + "400": + description: Bad request + "422": + description: Unprocessable Content + "500": + description: Internal server error + summary: Upload Source + tags: + - admin +components: + schemas: + Body_upload_file_upload_file_post: + properties: + file: + format: binary + title: File + type: string + required: + - file + title: Body_upload_file_upload_file_post + DocumentStatus: + description: DocumentStatus + example: + name: name + status: UPLOADING + properties: + name: + title: Name + type: string + status: + $ref: '#/components/schemas/Status' + required: + - name + - status + title: DocumentStatus + HTTPValidationError: + description: HTTPValidationError + example: + detail: + - msg: msg + loc: + - anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: "" + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + - anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: "" + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + type: type + - msg: msg + loc: + - anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: "" + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + - anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: "" + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + type: type + properties: + detail: + items: + $ref: 
'#/components/schemas/ValidationError' + nullable: true + title: detail + type: array + title: HTTPValidationError + KeyValuePair: + description: KeyValuePair + example: + value: value + key: key + properties: + key: + title: Key + type: string + value: + title: Value + type: string + required: + - key + - value + title: KeyValuePair + Status: + description: allowed enum values + enum: + - UPLOADING + - PROCESSING + - READY + - ERROR + title: Status + type: string + ValidationError: + description: ValidationError + example: + msg: msg + loc: + - anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: "" + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + - anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: "" + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + type: type + properties: + loc: + items: + $ref: '#/components/schemas/ValidationErrorLocInner' + title: loc + type: array + msg: + title: Msg + type: string + type: + title: Type + type: string + required: + - loc + - msg + - type + title: ValidationError + ValidationErrorLocInner: + description: ValidationErrorLocInner + example: + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: "" + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + properties: + anyof_schema_1_validator: + nullable: true + title: anyof_schema_1_validator + type: string + anyof_schema_2_validator: + nullable: true + title: anyof_schema_2_validator + type: integer + actual_instance: + title: actual_instance + any_of_schemas: + items: + type: string + title: any_of_schemas + type: array + title: ValidationErrorLocInner diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index 8835113..df6b1a3 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ 
b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -74,3 +74,10 @@ async def upload_source( ) -> None: """Uploads user selected source.""" + async def upload_file( + self, + file: UploadFile, + request: Request, + ) -> object: + """Uploads user selected file.""" + ... diff --git a/admin-api-lib/src/admin_api_lib/models/http_validation_error.py b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py index 40f6013..7e288e1 100644 --- a/admin-api-lib/src/admin_api_lib/models/http_validation_error.py +++ b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py @@ -78,6 +78,11 @@ def to_dict(self) -> Dict[str, Any]: if _item: _items.append(_item.to_dict()) _dict["detail"] = _items + # set to None if detail (nullable) is None + # and model_fields_set contains the field + if self.detail is None and "detail" in self.model_fields_set: + _dict["detail"] = None + return _dict @classmethod diff --git a/admin-api-lib/src/admin_api_lib/models/status.py b/admin-api-lib/src/admin_api_lib/models/status.py index e4ac64b..0ab750b 100644 --- a/admin-api-lib/src/admin_api_lib/models/status.py +++ b/admin-api-lib/src/admin_api_lib/models/status.py @@ -26,6 +26,10 @@ class Status(str, Enum): + """ + allowed enum values + """ + """ allowed enum values """ diff --git a/admin-api-lib/src/admin_api_lib/models/upload_source.py b/admin-api-lib/src/admin_api_lib/models/upload_source.py deleted file mode 100644 index 1d86e38..0000000 --- a/admin-api-lib/src/admin_api_lib/models/upload_source.py +++ /dev/null @@ -1,103 +0,0 @@ -# coding: utf-8 - -""" -admin-api-lib - -The API is used for the communication between the admin frontend and the admin backend in the rag project. - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -from __future__ import annotations -import pprint -import re # noqa: F401 -import json - -from fastapi import UploadFile -from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr -from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union -from admin_api_lib.models.key_value_pair import KeyValuePair - -try: - from typing import Self -except ImportError: - from typing_extensions import Self - - -class UploadSource(BaseModel): - """ """ # noqa: E501 - - type: StrictStr - name: StrictStr - kwargs: Optional[List[KeyValuePair]] = None - __properties: ClassVar[List[str]] = ["type", "name", "kwargs"] - - model_config = { - "populate_by_name": True, - "validate_assignment": True, - "protected_namespaces": (), - } - - def to_str(self) -> str: - """Returns the string representation of the model using alias""" - return pprint.pformat(self.model_dump(by_alias=True)) - - def to_json(self) -> str: - """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) - - @classmethod - def from_json(cls, json_str: str) -> Self: - """Create an instance of UploadSource from a JSON string""" - return cls.from_dict(json.loads(json_str)) - - def to_dict(self) -> Dict[str, Any]: - """Return the dictionary representation of the model using alias. - - This has the following differences from calling pydantic's - `self.model_dump(by_alias=True)`: - - * `None` is only added to the output dict for nullable fields that - were set at model initialization. Other fields with value `None` - are ignored. 
- """ - _dict = self.model_dump( - by_alias=True, - exclude={}, - exclude_none=True, - ) - # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) - _items = [] - if self.kwargs: - for _item in self.kwargs: - if _item: - _items.append(_item.to_dict()) - _dict["kwargs"] = _items - return _dict - - @classmethod - def from_dict(cls, obj: Dict) -> Self: - """Create an instance of UploadSource from a dict""" - if obj is None: - return None - - if not isinstance(obj, dict): - return cls.model_validate(obj) - - _obj = cls.model_validate( - { - "name": obj.get("name"), - "type": obj.get("type"), - "kwargs": ( - [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] - if obj.get("kwargs") is not None - else None - ), - } - ) - return _obj diff --git a/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py b/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py index 8cd53fe..0100c88 100644 --- a/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py +++ b/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py @@ -13,141 +13,103 @@ from __future__ import annotations -from inspect import getfullargspec -import json import pprint import re # noqa: F401 +import json -from pydantic import BaseModel, ConfigDict, Field, StrictInt, StrictStr, ValidationError, field_validator -from typing import Optional -from typing import Union, Any, List, TYPE_CHECKING, Optional, Dict -from typing_extensions import Literal -from pydantic import StrictStr, Field +from pydantic import BaseModel, ConfigDict, StrictInt, StrictStr +from typing import Any, ClassVar, Dict, List, Optional try: from typing import Self except ImportError: from typing_extensions import Self -VALIDATIONERRORLOCINNER_ANY_OF_SCHEMAS = ["int", "str"] - class ValidationErrorLocInner(BaseModel): """ ValidationErrorLocInner - """ + """ # noqa: E501 - # data type: str anyof_schema_1_validator: Optional[StrictStr] = None - # 
data type: int anyof_schema_2_validator: Optional[StrictInt] = None - if TYPE_CHECKING: - actual_instance: Optional[Union[int, str]] = None - else: - actual_instance: Any = None - any_of_schemas: List[str] = Literal[VALIDATIONERRORLOCINNER_ANY_OF_SCHEMAS] + actual_instance: Optional[Any] = None + any_of_schemas: Optional[List[StrictStr]] = None + __properties: ClassVar[List[str]] = [ + "anyof_schema_1_validator", + "anyof_schema_2_validator", + "actual_instance", + "any_of_schemas", + ] model_config = { + "populate_by_name": True, "validate_assignment": True, "protected_namespaces": (), } - def __init__(self, *args, **kwargs) -> None: - if args: - if len(args) > 1: - raise ValueError("If a position argument is used, only 1 is allowed to set `actual_instance`") - if kwargs: - raise ValueError("If a position argument is used, keyword arguments cannot be used.") - super().__init__(actual_instance=args[0]) - else: - super().__init__(**kwargs) - - @field_validator("actual_instance") - def actual_instance_must_validate_anyof(cls, v): - instance = ValidationErrorLocInner.model_construct() - error_messages = [] - # validate data type: str - try: - instance.anyof_schema_1_validator = v - return v - except (ValidationError, ValueError) as e: - error_messages.append(str(e)) - # validate data type: int - try: - instance.anyof_schema_2_validator = v - return v - except (ValidationError, ValueError) as e: - error_messages.append(str(e)) - if error_messages: - # no match - raise ValueError( - "No match found when setting the actual_instance in ValidationErrorLocInner with anyOf schemas: int, str. 
Details: " - + ", ".join(error_messages) - ) - else: - return v + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) - @classmethod - def from_dict(cls, obj: dict) -> Self: - return cls.from_json(json.dumps(obj)) + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) @classmethod def from_json(cls, json_str: str) -> Self: - """Returns the object represented by the json string""" - instance = cls.model_construct() - error_messages = [] - # deserialize data into str - try: - # validation - instance.anyof_schema_1_validator = json.loads(json_str) - # assign value to actual_instance - instance.actual_instance = instance.anyof_schema_1_validator - return instance - except (ValidationError, ValueError) as e: - error_messages.append(str(e)) - # deserialize data into int - try: - # validation - instance.anyof_schema_2_validator = json.loads(json_str) - # assign value to actual_instance - instance.actual_instance = instance.anyof_schema_2_validator - return instance - except (ValidationError, ValueError) as e: - error_messages.append(str(e)) - - if error_messages: - # no match - raise ValueError( - "No match found when deserializing the JSON string into ValidationErrorLocInner with anyOf schemas: int, str. 
Details: " - + ", ".join(error_messages) - ) - else: - return instance - - def to_json(self) -> str: - """Returns the JSON representation of the actual instance""" - if self.actual_instance is None: - return "null" - - to_json = getattr(self.actual_instance, "to_json", None) - if callable(to_json): - return self.actual_instance.to_json() - else: - return json.dumps(self.actual_instance) - - def to_dict(self) -> Dict: - """Returns the dict representation of the actual instance""" - if self.actual_instance is None: - return "null" - - to_json = getattr(self.actual_instance, "to_json", None) - if callable(to_json): - return self.actual_instance.to_dict() - else: - # primitive type - return self.actual_instance + """Create an instance of ValidationErrorLocInner from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. 
+ """ + _dict = self.model_dump( + by_alias=True, + exclude={}, + exclude_none=True, + ) + # set to None if anyof_schema_1_validator (nullable) is None + # and model_fields_set contains the field + if self.anyof_schema_1_validator is None and "anyof_schema_1_validator" in self.model_fields_set: + _dict["anyof_schema_1_validator"] = None + + # set to None if anyof_schema_2_validator (nullable) is None + # and model_fields_set contains the field + if self.anyof_schema_2_validator is None and "anyof_schema_2_validator" in self.model_fields_set: + _dict["anyof_schema_2_validator"] = None + + # set to None if actual_instance (nullable) is None + # and model_fields_set contains the field + if self.actual_instance is None and "actual_instance" in self.model_fields_set: + _dict["actual_instance"] = None + + return _dict - def to_str(self) -> str: - """Returns the string representation of the actual instance""" - return pprint.pformat(self.model_dump()) + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of ValidationErrorLocInner from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate( + { + "anyof_schema_1_validator": obj.get("anyof_schema_1_validator"), + "anyof_schema_2_validator": obj.get("anyof_schema_2_validator"), + "actual_instance": obj.get("actual_instance"), + "any_of_schemas": obj.get("any_of_schemas"), + } + ) + return _obj From 40625379663c82e977f7739dbab7658eec002001 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Thu, 15 May 2025 13:43:10 +0200 Subject: [PATCH 09/56] wip --- .../api_endpoints/file_uploader.py | 14 ++ .../api_endpoints/source_uploader.py | 3 +- .../src/admin_api_lib/apis/admin_api.py | 9 +- .../src/admin_api_lib/apis/admin_api_base.py | 2 +- .../src/admin_api_lib/impl/admin_api.py | 16 +- .../api_endpoints/default_file_uploader.py | 152 ++++++++++++++++++ .../api_endpoints/default_source_uploader.py | 27 +--- 7 files changed, 
191 insertions(+), 32 deletions(-) create mode 100644 admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py create mode 100644 admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py new file mode 100644 index 0000000..aaeea8f --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -0,0 +1,14 @@ +from abc import ABC, abstractmethod + +from fastapi import UploadFile + + + +class FileUploader(ABC): + + @abstractmethod + async def upload_file( + self, + base_url: str, + file: UploadFile, + ) -> None: ... diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py index 0c9b73e..9cdd59e 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -15,6 +15,5 @@ async def upload_source( base_url: str, type: StrictStr, name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[list[KeyValuePair]], + kwargs: list[KeyValuePair], ) -> None: ... 
diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 15f8438..4fe1e15 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -134,7 +134,7 @@ async def get_all_documents_status() -> List[DocumentStatus]: @router.post( "/upload_file", responses={ - 200: {"model": object, "description": "ok"}, + 200: {"description": "ok"}, 400: {"description": "Bad request"}, 422: {"description": "Unprocessable Content"}, 500: {"description": "Internal server error"}, @@ -146,11 +146,11 @@ async def get_all_documents_status() -> List[DocumentStatus]: async def upload_file( file: UploadFile, request: Request, -) -> object: +) -> None: """Uploads user selected sources.""" if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().upload_file(file) + return await BaseAdminApi.subclasses[0]().upload_file(file, request) @router.post( @@ -166,7 +166,6 @@ async def upload_file( response_model_by_alias=True, ) async def upload_source( - request: Request, type: StrictStr = Query(None, description="", alias="type"), name: StrictStr = Query(None, description="", alias="name"), @@ -175,4 +174,4 @@ async def upload_source( """Uploads user selected sources.""" if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().upload_source(type, name, key_value_pair) + return await BaseAdminApi.subclasses[0]().upload_source(type, name, key_value_pair, request) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index df6b1a3..09d4d6d 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -78,6 +78,6 @@ async def upload_file( self, file: UploadFile, request: Request, - 
) -> object: + ) -> None: """Uploads user selected file.""" ... diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index dd39f3c..3adbae1 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -93,12 +93,22 @@ async def upload_source( self, type: StrictStr, name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[List[KeyValuePair]], + kwargs: List[KeyValuePair], request: Request, source_uploader: SourceUploader = Depends(Provide[DependencyContainer.source_uploader]), ) -> None: - await source_uploader.upload_source(str(request.base_url), type, name, file, kwargs) + await source_uploader.upload_source(str(request.base_url), type, name, kwargs) + + + @inject + async def upload_file( + self, + file: UploadFile, + request: Request, + file_uploader: FileUploader = Depends(Provide[DependencyContainer.file_uploader]), + ) -> None: + await file_uploader.upload_file(str(request.base_url), file) + @inject async def document_reference_id_get( diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py new file mode 100644 index 0000000..0dd5b4f --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -0,0 +1,152 @@ +from fastapi import HTTPException +import logging +import os +from pathlib import Path +import traceback +from typing import Optional, Tuple, Union +from threading import Thread +import urllib.parse +import tempfile +from urllib.request import Request + +from pydantic import StrictBytes, StrictStr +from fastapi import UploadFile, status +from langchain_core.documents import Document +from asyncio import run + +from admin_api_lib.models.key_value_pair import KeyValuePair +from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi +from 
admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document +from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter +from admin_api_lib.api_endpoints.file_uploader import FileUploader +from admin_api_lib.chunker.chunker import Chunker +from admin_api_lib.models.status import Status +from admin_api_lib.extractor_api_client.extractor_api_client import ExtractorApiClient +from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore +from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer +from admin_api_lib.utils.utils import sanitize_document_name + +logger = logging.getLogger(__name__) + + +class DefaultFileUploader(FileUploader): + + def __init__( + self, + extractor_api: ExtractorApiClient, + key_value_store: FileStatusKeyValueStore, + information_enhancer: InformationEnhancer, + chunker: Chunker, + document_deleter: DocumentDeleter, + rag_api: RagApi, + information_mapper: InformationPiece2Document, + ): + self._extractor_api = extractor_api + self._rag_api = rag_api + self._key_value_store = key_value_store + self._information_mapper = information_mapper + self._information_enhancer = information_enhancer + self._chunker = chunker + self._document_deleter = document_deleter + self._background_threads = [] + + async def upload_file( + self, + base_url: str, + file: UploadFile, + ) -> None: + self._background_threads = [t for t in self._background_threads if t.is_alive()] + + + try: + content = await file.read() + file.filename = sanitize_document_name(file.filename) + source_name = f"file:{sanitize_document_name(file.filename)}" + # TODO: check if document already in processing state + self._key_value_store.upsert( + source_name, Status.PROCESSING + ) # TODO: change to pipeline with timeout to error status + s3_path = await self._asave_new_document(content, file.filename, source_name) + thread = Thread( + target=lambda: 
run(self._handle_source_upload(s3_path,source_name, file.filename, base_url)) + ) + thread.start() + self._background_threads.append(thread) + except ValueError as e: + self._key_value_store.upsert(source_name, Status.ERROR) + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) + except Exception as e: + self._key_value_store.upsert(source_name, Status.ERROR) + logger.error("Error while uploading %s = %s", source_name, str(e)) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) +
+    async def _handle_source_upload(
+        self,
+        s3_path:Path,
+        source_name: str,
+        file_name:str,
+        base_url: str,
+    ):
+        try:
+            information_pieces = self._extractor_api.extract(s3_path, source_name)
+
+            if not information_pieces:
+                self._key_value_store.upsert(source_name, Status.ERROR)
+                logger.error("No information pieces found in the document: %s", source_name)
+            documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces]
+
+            chunked_documents = self._chunker.chunk(documents)
+
+            enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents)
+            self._add_file_url(file_name,base_url,enhanced_documents)
+
+            rag_information_pieces = [
+                self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents
+            ]
+            # Replace old document
+            try:
+                await self._document_deleter.adelete_document(source_name)
+            except Exception as e:
+                # deletion is allowed to fail
+                pass
+            self._rag_api.upload_information_piece(rag_information_pieces)
+            self._key_value_store.upsert(source_name, Status.READY)
+            logger.info("Source uploaded successfully: %s", source_name)
+        except Exception as e:
+            self._key_value_store.upsert(source_name, Status.ERROR)
+            logger.error("Error while uploading %s = %s", source_name, str(e))
+
+    def _add_file_url(
+        self, file_name: str, base_url: str, chunked_documents: list[Document]
+    ):
+        document_url = f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file_name)}"
+        for idx, chunk in enumerate(chunked_documents):
+            if chunk.metadata["id"] in chunk.metadata["related"]:
+                chunk.metadata["related"].remove(chunk.metadata["id"])
+            chunk.metadata.update(
+                {
+                    "chunk": idx,
+                    "chunk_length": len(chunk.page_content),
+                    "document_url": document_url,
+                }
+            )
+
+    async def _asave_new_document(
+        self,
+        file_content: bytes,
+        filename: str,
+        source_name:str,
+    )->Path:
+        try:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                temp_file_path = Path(temp_dir) / filename
+                with open(temp_file_path, "wb") as temp_file:
+                    logger.debug("Temporary file created at %s.", temp_file_path)
+                    temp_file.write(file_content)
+                    logger.debug("Temp file created and content written.")
+
+                self._file_service.upload_file(Path(temp_file_path), filename)  # NOTE(review): _file_service is never assigned in __init__ — inject it, or this raises AttributeError
+                return Path(temp_file_path)  # NOTE(review): this path is inside the TemporaryDirectory and is deleted on with-exit — confirm downstream only needs the uploaded copy
+        except Exception as e:
+            logger.error("Error during document saving: %s %s", e, traceback.format_exc())
+            self._key_value_store.upsert(source_name, Status.ERROR) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 196d8a7..5ef6a72 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -53,8 +53,7 @@ async def upload_source( base_url: str, type: StrictStr, name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[list[KeyValuePair]], + kwargs: list[KeyValuePair], ) -> None: self._background_threads = [t for t in self._background_threads if t.is_alive()] source_name = f"{type}:{sanitize_document_name(name)}" @@ -62,15 +61,9 @@ async def upload_source( # TODO: check if document already in processing state self._key_value_store.upsert( source_name, Status.PROCESSING - ) # TODO: change to pipeline with timeout to error status - filename = None - if 
file: - content = await file.read() - filename = Path("/tmp/" + file.filename) - with open(filename, "wb") as tmpfile: - tmpfile.write(content) + ) # TODO: change to pipeline with timeout to error status thread = Thread( - target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, filename, kwargs)) + target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, kwargs)) ) thread.start() self._background_threads.append(thread) @@ -87,15 +80,10 @@ async def _handle_source_upload( source_name: str, base_url: str, type: StrictStr, - name: StrictStr, - file, #: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]], - kwargs: Optional[list[KeyValuePair]], + kwargs: list[KeyValuePair], ): try: - if file: - information_pieces = self._extractor_api.extract(type, source_name, str(file), kwargs) - else: - information_pieces = self._extractor_api.extract(type, source_name, None, kwargs) + information_pieces = self._extractor_api.extract(type, source_name, kwargs) if not information_pieces: self._key_value_store.upsert(source_name, Status.ERROR) @@ -103,7 +91,6 @@ async def _handle_source_upload( documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces] chunked_documents = self._chunker.chunk(documents) - self._add_file_url(type, file, base_url, chunked_documents) enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents) rag_information_pieces = [ @@ -118,9 +105,7 @@ async def _handle_source_upload( pass self._rag_api.upload_information_piece(rag_information_pieces) self._key_value_store.upsert(source_name, Status.READY) - logger.info("File uploaded successfully: %s", source_name) - if file: - os.remove(file) + logger.info("Source uploaded successfully: %s", source_name) except Exception as e: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("Error while uploading %s = %s", source_name, str(e)) From 
96b6d101744ce360403b7d59d9ec691e8bd00159 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Thu, 15 May 2025 13:53:54 +0200 Subject: [PATCH 10/56] wip --- .../api_endpoints/file_uploader.py | 3 +- .../src/admin_api_lib/apis/admin_api_base.py | 4 +- .../extractor_api_client.py | 50 -- .../extractor_api_client/models/__init__.py | 19 - .../openapi_client/__init__.py | 38 + .../openapi_client/api/__init__.py | 4 + .../openapi_client/api/extractor_api.py | 516 +++++++++++++ .../openapi_client/api_client.py | 695 ++++++++++++++++++ .../openapi_client/api_response.py | 20 + .../openapi_client/configuration.py | 460 ++++++++++++ .../openapi_client/exceptions.py | 197 +++++ .../openapi_client/models/__init__.py | 21 + .../models/content_type.py | 0 .../models/extraction_parameters.py | 103 +++ .../models/extraction_request.py | 82 +++ .../models/information_piece.py | 4 +- .../models/key_value_pair.py | 0 .../openapi_client/rest.py | 209 ++++++ .../openapi_client/test/__init__.py | 0 .../openapi_client/test/test_content_type.py | 35 + .../test/test_extraction_parameters.py | 59 ++ .../test/test_extraction_request.py | 56 ++ .../openapi_client/test/test_extractor_api.py | 39 + .../test/test_information_piece.py | 62 ++ .../test/test_key_value_pair.py | 54 ++ .../src/admin_api_lib/impl/admin_api.py | 2 - .../api_endpoints/default_file_uploader.py | 25 +- .../api_endpoints/default_source_uploader.py | 6 +- extractor-api-lib/openapi.yaml | 75 +- .../extractor_api_lib/apis/extractor_api.py | 45 +- .../apis/extractor_api_base.py | 19 +- .../extractors/information_file_extractor.py | 2 +- .../models/extraction_parameters.py | 105 +++ .../models/extraction_request.py | 31 +- 34 files changed, 2880 insertions(+), 160 deletions(-) delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py delete mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/models/__init__.py create mode 100644 
admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py rename admin-api-lib/src/admin_api_lib/extractor_api_client/{ => openapi_client}/models/content_type.py (100%) create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py rename admin-api-lib/src/admin_api_lib/extractor_api_client/{ => openapi_client}/models/information_piece.py (94%) rename admin-api-lib/src/admin_api_lib/extractor_api_client/{ => openapi_client}/models/key_value_pair.py (100%) create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/__init__.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py create mode 100644 
admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py create mode 100644 admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py create mode 100644 extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py index aaeea8f..2a33545 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -3,12 +3,11 @@ from fastapi import UploadFile - class FileUploader(ABC): @abstractmethod async def upload_file( self, base_url: str, - file: UploadFile, + file: UploadFile, ) -> None: ... diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index 09d4d6d..eb5ca84 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -21,7 +21,7 @@ def __init_subclass__(cls, **kwargs): async def delete_document( self, identification: StrictStr, - ) -> None: + ) -> None: """ Asynchronously deletes a document based on the provided identification. 
@@ -73,7 +73,7 @@ async def upload_source( request: Request, ) -> None: """Uploads user selected source.""" - + async def upload_file( self, file: UploadFile, diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py deleted file mode 100644 index 78ccbf7..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/extractor_api_client.py +++ /dev/null @@ -1,50 +0,0 @@ -import requests -from admin_api_lib.extractor_api_client.models.information_piece import InformationPiece -from requests_toolbelt.multipart import MultipartEncoder - - -class ExtractorApiClient: - def __init__(self, base_url): - """ - Initialize the client with the base URL of the API. - - Args: - base_url (str): The base URL of the API. - """ - self.base_url = base_url - - def extract(self, type, name, file, kwargs=None): - """ - Send an extraction request to the API. - - Args: - file (str): The path to the file to extract from. - name (str): The name of the extraction request. - type (str): The type of extraction to perform. - kwargs (list): A list of key-value pairs to pass as additional arguments. - - Returns: - list: A list of extracted information pieces. 
- """ - with open(file, "rb") as openfile: - url = self.base_url + "/extract" - encoder = MultipartEncoder( - fields={ - "file": (file, openfile, "application/octet-stream"), - "name": name, - "type": type, - } - ) - if kwargs: - for pair in kwargs: - encoder.add_field(pair["key"], pair["value"]) - response = requests.post(url, headers={"Content-Type": encoder.content_type}, data=encoder) - if response.status_code == 200: - response_json = response.json() - return [InformationPiece.from_dict(x) for x in response_json] - elif response.status_code == 422: - raise ValueError("Invalid source") - elif response.status_code == 500: - raise Exception("Internal server error") - else: - raise Exception("Unknown error") diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/models/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/models/__init__.py deleted file mode 100644 index 53560b6..0000000 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/models/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# coding: utf-8 - -# flake8: noqa -""" -extractor-api-lib - -No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) - -The version of the OpenAPI document: 1.0.0 -Generated by OpenAPI Generator (https://openapi-generator.tech) - -Do not edit the class manually. 
-""" # noqa: E501 - - -# import models into model package -from admin_api_lib.extractor_api_client.models.content_type import ContentType -from admin_api_lib.extractor_api_client.models.information_piece import InformationPiece -from admin_api_lib.extractor_api_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py new file mode 100644 index 0000000..edf9fd4 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py @@ -0,0 +1,38 @@ +# coding: utf-8 + +# flake8: noqa + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. +""" # noqa: E501 + + +__version__ = "1.0.0" + +# import apis into sdk package +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi + +# import ApiClient +from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse +from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient +from admin_api_lib.extractor_api_client.openapi_client.configuration import Configuration +from admin_api_lib.extractor_api_client.openapi_client.exceptions import OpenApiException +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiTypeError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiValueError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiKeyError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiAttributeError +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiException + +# import models into sdk package +from 
admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py new file mode 100644 index 0000000..c95ce65 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py @@ -0,0 +1,4 @@ +# flake8: noqa + +# import apis into api package +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py new file mode 100644 index 0000000..1aaddf7 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py @@ -0,0 +1,516 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + +import warnings +from pydantic import validate_call, Field, StrictFloat, StrictStr, StrictInt +from typing import Any, Dict, List, Optional, Tuple, Union +from typing_extensions import Annotated + +from typing import List +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece + +from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient, RequestSerialized +from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse +from admin_api_lib.extractor_api_client.openapi_client.rest import RESTResponseType + + +class ExtractorApi: + """NOTE: This class is auto generated by OpenAPI Generator + Ref: https://openapi-generator.tech + + Do not edit the class manually. + """ + + def __init__(self, api_client=None) -> None: + if api_client is None: + api_client = ApiClient.get_default() + self.api_client = api_client + + @validate_call + def extract_from_file_post( + self, + extraction_request: ExtractionRequest, + _request_timeout: Union[ + None, + Annotated[StrictFloat, Field(gt=0)], + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + ] = None, + _request_auth: Optional[Dict[StrictStr, Any]] = None, + _content_type: Optional[StrictStr] = None, + _headers: Optional[Dict[StrictStr, Any]] = None, + _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, + ) -> List[InformationPiece]: + """extract_from_file_post + + + :param extraction_request: (required) + :type extraction_request: ExtractionRequest + :param _request_timeout: timeout setting for this request. If one + number provided, it will be total request + timeout. It can also be a pair (tuple) of + (connection, read) timeouts. 
+ :type _request_timeout: int, tuple(int, int), optional + :param _request_auth: set to override the auth_settings for an a single + request; this effectively ignores the + authentication in the spec for a single request. + :type _request_auth: dict, optional + :param _content_type: force content-type for the request. + :type _content_type: str, Optional + :param _headers: set to override the headers for a single + request; this effectively ignores the headers + in the spec for a single request. + :type _headers: dict, optional + :param _host_index: set to override the host_index for a single + request; this effectively ignores the host_index + in the spec for a single request. + :type _host_index: int, optional + :return: Returns the result object. + """ # noqa: E501 + + _param = self._extract_from_file_post_serialize( + extraction_request=extraction_request, + _request_auth=_request_auth, + _content_type=_content_type, + _headers=_headers, + _host_index=_host_index, + ) + + _response_types_map: Dict[str, Optional[str]] = { + "200": "List[InformationPiece]", + "422": None, + "500": None, + } + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + response_data.read() + return self.api_client.response_deserialize( + response_data=response_data, + response_types_map=_response_types_map, + ).data + + @validate_call + def extract_from_file_post_with_http_info( + self, + extraction_request: ExtractionRequest, + _request_timeout: Union[ + None, + Annotated[StrictFloat, Field(gt=0)], + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + ] = None, + _request_auth: Optional[Dict[StrictStr, Any]] = None, + _content_type: Optional[StrictStr] = None, + _headers: Optional[Dict[StrictStr, Any]] = None, + _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, + ) -> ApiResponse[List[InformationPiece]]: + """extract_from_file_post + + + :param extraction_request: (required) + :type extraction_request: 
ExtractionRequest + :param _request_timeout: timeout setting for this request. If one + number provided, it will be total request + timeout. It can also be a pair (tuple) of + (connection, read) timeouts. + :type _request_timeout: int, tuple(int, int), optional + :param _request_auth: set to override the auth_settings for an a single + request; this effectively ignores the + authentication in the spec for a single request. + :type _request_auth: dict, optional + :param _content_type: force content-type for the request. + :type _content_type: str, Optional + :param _headers: set to override the headers for a single + request; this effectively ignores the headers + in the spec for a single request. + :type _headers: dict, optional + :param _host_index: set to override the host_index for a single + request; this effectively ignores the host_index + in the spec for a single request. + :type _host_index: int, optional + :return: Returns the result object. + """ # noqa: E501 + + _param = self._extract_from_file_post_serialize( + extraction_request=extraction_request, + _request_auth=_request_auth, + _content_type=_content_type, + _headers=_headers, + _host_index=_host_index, + ) + + _response_types_map: Dict[str, Optional[str]] = { + "200": "List[InformationPiece]", + "422": None, + "500": None, + } + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + response_data.read() + return self.api_client.response_deserialize( + response_data=response_data, + response_types_map=_response_types_map, + ) + + @validate_call + def extract_from_file_post_without_preload_content( + self, + extraction_request: ExtractionRequest, + _request_timeout: Union[ + None, + Annotated[StrictFloat, Field(gt=0)], + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + ] = None, + _request_auth: Optional[Dict[StrictStr, Any]] = None, + _content_type: Optional[StrictStr] = None, + _headers: Optional[Dict[StrictStr, Any]] = None, + 
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, + ) -> RESTResponseType: + """extract_from_file_post + + + :param extraction_request: (required) + :type extraction_request: ExtractionRequest + :param _request_timeout: timeout setting for this request. If one + number provided, it will be total request + timeout. It can also be a pair (tuple) of + (connection, read) timeouts. + :type _request_timeout: int, tuple(int, int), optional + :param _request_auth: set to override the auth_settings for an a single + request; this effectively ignores the + authentication in the spec for a single request. + :type _request_auth: dict, optional + :param _content_type: force content-type for the request. + :type _content_type: str, Optional + :param _headers: set to override the headers for a single + request; this effectively ignores the headers + in the spec for a single request. + :type _headers: dict, optional + :param _host_index: set to override the host_index for a single + request; this effectively ignores the host_index + in the spec for a single request. + :type _host_index: int, optional + :return: Returns the result object. 
+ """ # noqa: E501 + + _param = self._extract_from_file_post_serialize( + extraction_request=extraction_request, + _request_auth=_request_auth, + _content_type=_content_type, + _headers=_headers, + _host_index=_host_index, + ) + + _response_types_map: Dict[str, Optional[str]] = { + "200": "List[InformationPiece]", + "422": None, + "500": None, + } + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + return response_data.response + + def _extract_from_file_post_serialize( + self, + extraction_request, + _request_auth, + _content_type, + _headers, + _host_index, + ) -> RequestSerialized: + + _host = None + + _collection_formats: Dict[str, str] = {} + + _path_params: Dict[str, str] = {} + _query_params: List[Tuple[str, str]] = [] + _header_params: Dict[str, Optional[str]] = _headers or {} + _form_params: List[Tuple[str, str]] = [] + _files: Dict[str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]] = {} + _body_params: Optional[bytes] = None + + # process the path parameters + # process the query parameters + # process the header parameters + # process the form parameters + # process the body parameter + if extraction_request is not None: + _body_params = extraction_request + + # set the HTTP header `Accept` + if "Accept" not in _header_params: + _header_params["Accept"] = self.api_client.select_header_accept(["application/json"]) + + # set the HTTP header `Content-Type` + if _content_type: + _header_params["Content-Type"] = _content_type + else: + _default_content_type = self.api_client.select_header_content_type(["application/json"]) + if _default_content_type is not None: + _header_params["Content-Type"] = _default_content_type + + # authentication setting + _auth_settings: List[str] = [] + + return self.api_client.param_serialize( + method="POST", + resource_path="/extract_from_file", + path_params=_path_params, + query_params=_query_params, + header_params=_header_params, + body=_body_params, + 
post_params=_form_params, + files=_files, + auth_settings=_auth_settings, + collection_formats=_collection_formats, + _host=_host, + _request_auth=_request_auth, + ) + + @validate_call + def extract_from_source( + self, + extraction_parameters: ExtractionParameters, + _request_timeout: Union[ + None, + Annotated[StrictFloat, Field(gt=0)], + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + ] = None, + _request_auth: Optional[Dict[StrictStr, Any]] = None, + _content_type: Optional[StrictStr] = None, + _headers: Optional[Dict[StrictStr, Any]] = None, + _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, + ) -> List[InformationPiece]: + """extract_from_source + + + :param extraction_parameters: (required) + :type extraction_parameters: ExtractionParameters + :param _request_timeout: timeout setting for this request. If one + number provided, it will be total request + timeout. It can also be a pair (tuple) of + (connection, read) timeouts. + :type _request_timeout: int, tuple(int, int), optional + :param _request_auth: set to override the auth_settings for an a single + request; this effectively ignores the + authentication in the spec for a single request. + :type _request_auth: dict, optional + :param _content_type: force content-type for the request. + :type _content_type: str, Optional + :param _headers: set to override the headers for a single + request; this effectively ignores the headers + in the spec for a single request. + :type _headers: dict, optional + :param _host_index: set to override the host_index for a single + request; this effectively ignores the host_index + in the spec for a single request. + :type _host_index: int, optional + :return: Returns the result object. 
+ """ # noqa: E501 + + _param = self._extract_from_source_serialize( + extraction_parameters=extraction_parameters, + _request_auth=_request_auth, + _content_type=_content_type, + _headers=_headers, + _host_index=_host_index, + ) + + _response_types_map: Dict[str, Optional[str]] = { + "200": "List[InformationPiece]", + "404": None, + "422": None, + "500": None, + } + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + response_data.read() + return self.api_client.response_deserialize( + response_data=response_data, + response_types_map=_response_types_map, + ).data + + @validate_call + def extract_from_source_with_http_info( + self, + extraction_parameters: ExtractionParameters, + _request_timeout: Union[ + None, + Annotated[StrictFloat, Field(gt=0)], + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + ] = None, + _request_auth: Optional[Dict[StrictStr, Any]] = None, + _content_type: Optional[StrictStr] = None, + _headers: Optional[Dict[StrictStr, Any]] = None, + _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, + ) -> ApiResponse[List[InformationPiece]]: + """extract_from_source + + + :param extraction_parameters: (required) + :type extraction_parameters: ExtractionParameters + :param _request_timeout: timeout setting for this request. If one + number provided, it will be total request + timeout. It can also be a pair (tuple) of + (connection, read) timeouts. + :type _request_timeout: int, tuple(int, int), optional + :param _request_auth: set to override the auth_settings for an a single + request; this effectively ignores the + authentication in the spec for a single request. + :type _request_auth: dict, optional + :param _content_type: force content-type for the request. + :type _content_type: str, Optional + :param _headers: set to override the headers for a single + request; this effectively ignores the headers + in the spec for a single request. 
+ :type _headers: dict, optional + :param _host_index: set to override the host_index for a single + request; this effectively ignores the host_index + in the spec for a single request. + :type _host_index: int, optional + :return: Returns the result object. + """ # noqa: E501 + + _param = self._extract_from_source_serialize( + extraction_parameters=extraction_parameters, + _request_auth=_request_auth, + _content_type=_content_type, + _headers=_headers, + _host_index=_host_index, + ) + + _response_types_map: Dict[str, Optional[str]] = { + "200": "List[InformationPiece]", + "404": None, + "422": None, + "500": None, + } + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + response_data.read() + return self.api_client.response_deserialize( + response_data=response_data, + response_types_map=_response_types_map, + ) + + @validate_call + def extract_from_source_without_preload_content( + self, + extraction_parameters: ExtractionParameters, + _request_timeout: Union[ + None, + Annotated[StrictFloat, Field(gt=0)], + Tuple[Annotated[StrictFloat, Field(gt=0)], Annotated[StrictFloat, Field(gt=0)]], + ] = None, + _request_auth: Optional[Dict[StrictStr, Any]] = None, + _content_type: Optional[StrictStr] = None, + _headers: Optional[Dict[StrictStr, Any]] = None, + _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0, + ) -> RESTResponseType: + """extract_from_source + + + :param extraction_parameters: (required) + :type extraction_parameters: ExtractionParameters + :param _request_timeout: timeout setting for this request. If one + number provided, it will be total request + timeout. It can also be a pair (tuple) of + (connection, read) timeouts. + :type _request_timeout: int, tuple(int, int), optional + :param _request_auth: set to override the auth_settings for an a single + request; this effectively ignores the + authentication in the spec for a single request. 
+ :type _request_auth: dict, optional + :param _content_type: force content-type for the request. + :type _content_type: str, Optional + :param _headers: set to override the headers for a single + request; this effectively ignores the headers + in the spec for a single request. + :type _headers: dict, optional + :param _host_index: set to override the host_index for a single + request; this effectively ignores the host_index + in the spec for a single request. + :type _host_index: int, optional + :return: Returns the result object. + """ # noqa: E501 + + _param = self._extract_from_source_serialize( + extraction_parameters=extraction_parameters, + _request_auth=_request_auth, + _content_type=_content_type, + _headers=_headers, + _host_index=_host_index, + ) + + _response_types_map: Dict[str, Optional[str]] = { + "200": "List[InformationPiece]", + "404": None, + "422": None, + "500": None, + } + response_data = self.api_client.call_api(*_param, _request_timeout=_request_timeout) + return response_data.response + + def _extract_from_source_serialize( + self, + extraction_parameters, + _request_auth, + _content_type, + _headers, + _host_index, + ) -> RequestSerialized: + + _host = None + + _collection_formats: Dict[str, str] = {} + + _path_params: Dict[str, str] = {} + _query_params: List[Tuple[str, str]] = [] + _header_params: Dict[str, Optional[str]] = _headers or {} + _form_params: List[Tuple[str, str]] = [] + _files: Dict[str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]] = {} + _body_params: Optional[bytes] = None + + # process the path parameters + # process the query parameters + # process the header parameters + # process the form parameters + # process the body parameter + if extraction_parameters is not None: + _body_params = extraction_parameters + + # set the HTTP header `Accept` + if "Accept" not in _header_params: + _header_params["Accept"] = self.api_client.select_header_accept(["application/json"]) + + # set the HTTP header 
`Content-Type` + if _content_type: + _header_params["Content-Type"] = _content_type + else: + _default_content_type = self.api_client.select_header_content_type(["application/json"]) + if _default_content_type is not None: + _header_params["Content-Type"] = _default_content_type + + # authentication setting + _auth_settings: List[str] = [] + + return self.api_client.param_serialize( + method="POST", + resource_path="/extract_from_source", + path_params=_path_params, + query_params=_query_params, + header_params=_header_params, + body=_body_params, + post_params=_form_params, + files=_files, + auth_settings=_auth_settings, + collection_formats=_collection_formats, + _host=_host, + _request_auth=_request_auth, + ) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py new file mode 100644 index 0000000..ba8f5d2 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py @@ -0,0 +1,695 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import datetime +from dateutil.parser import parse +from enum import Enum +import decimal +import json +import mimetypes +import os +import re +import tempfile + +from urllib.parse import quote +from typing import Tuple, Optional, List, Dict, Union +from pydantic import SecretStr + +from admin_api_lib.extractor_api_client.openapi_client.configuration import Configuration +from admin_api_lib.extractor_api_client.openapi_client.api_response import ApiResponse, T as ApiResponseT +import admin_api_lib.extractor_api_client.openapi_client.models +from admin_api_lib.extractor_api_client.openapi_client import rest +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ( + ApiValueError, + ApiException, + BadRequestException, + UnauthorizedException, + ForbiddenException, + NotFoundException, + ServiceException, +) + +RequestSerialized = Tuple[str, str, Dict[str, str], Optional[str], List[str]] + + +class ApiClient: + """Generic API client for OpenAPI client library builds. + + OpenAPI generic API client. This client handles the client- + server communication, and is invariant across implementations. Specifics of + the methods and models for each application are generated from the OpenAPI + templates. + + :param configuration: .Configuration object for this client + :param header_name: a header to pass when making calls to the API. + :param header_value: a header value to pass when making calls to + the API. + :param cookie: a cookie to include in the header when making calls + to the API + """ + + PRIMITIVE_TYPES = (float, bool, bytes, str, int) + NATIVE_TYPES_MAPPING = { + "int": int, + "long": int, # TODO remove as only py3 is supported? 
+ "float": float, + "str": str, + "bool": bool, + "date": datetime.date, + "datetime": datetime.datetime, + "decimal": decimal.Decimal, + "object": object, + } + _pool = None + + def __init__(self, configuration=None, header_name=None, header_value=None, cookie=None) -> None: + # use default configuration if none is provided + if configuration is None: + configuration = Configuration.get_default() + self.configuration = configuration + + self.rest_client = rest.RESTClientObject(configuration) + self.default_headers = {} + if header_name is not None: + self.default_headers[header_name] = header_value + self.cookie = cookie + # Set default User-Agent. + self.user_agent = "OpenAPI-Generator/1.0.0/python" + self.client_side_validation = configuration.client_side_validation + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + pass + + @property + def user_agent(self): + """User agent for this API client""" + return self.default_headers["User-Agent"] + + @user_agent.setter + def user_agent(self, value): + self.default_headers["User-Agent"] = value + + def set_default_header(self, header_name, header_value): + self.default_headers[header_name] = header_value + + _default = None + + @classmethod + def get_default(cls): + """Return new instance of ApiClient. + + This method returns newly created, based on default constructor, + object of ApiClient class or returns a copy of default + ApiClient. + + :return: The ApiClient object. + """ + if cls._default is None: + cls._default = ApiClient() + return cls._default + + @classmethod + def set_default(cls, default): + """Set default instance of ApiClient. + + It stores default ApiClient. + + :param default: object of ApiClient. 
+ """ + cls._default = default + + def param_serialize( + self, + method, + resource_path, + path_params=None, + query_params=None, + header_params=None, + body=None, + post_params=None, + files=None, + auth_settings=None, + collection_formats=None, + _host=None, + _request_auth=None, + ) -> RequestSerialized: + """Builds the HTTP request params needed by the request. + :param method: Method to call. + :param resource_path: Path to method endpoint. + :param path_params: Path parameters in the url. + :param query_params: Query parameters in the url. + :param header_params: Header parameters to be + placed in the request header. + :param body: Request body. + :param post_params dict: Request post form parameters, + for `application/x-www-form-urlencoded`, `multipart/form-data`. + :param auth_settings list: Auth Settings names for the request. + :param files dict: key -> filename, value -> filepath, + for `multipart/form-data`. + :param collection_formats: dict of collection formats for path, query, + header, and post parameters. + :param _request_auth: set to override the auth_settings for an a single + request; this effectively ignores the authentication + in the spec for a single request. 
+ :return: tuple of form (path, http_method, query_params, header_params, + body, post_params, files) + """ + + config = self.configuration + + # header parameters + header_params = header_params or {} + header_params.update(self.default_headers) + if self.cookie: + header_params["Cookie"] = self.cookie + if header_params: + header_params = self.sanitize_for_serialization(header_params) + header_params = dict(self.parameters_to_tuples(header_params, collection_formats)) + + # path parameters + if path_params: + path_params = self.sanitize_for_serialization(path_params) + path_params = self.parameters_to_tuples(path_params, collection_formats) + for k, v in path_params: + # specified safe chars, encode everything + resource_path = resource_path.replace("{%s}" % k, quote(str(v), safe=config.safe_chars_for_path_param)) + + # post parameters + if post_params or files: + post_params = post_params if post_params else [] + post_params = self.sanitize_for_serialization(post_params) + post_params = self.parameters_to_tuples(post_params, collection_formats) + if files: + post_params.extend(self.files_parameters(files)) + + # auth setting + self.update_params_for_auth( + header_params, query_params, auth_settings, resource_path, method, body, request_auth=_request_auth + ) + + # body + if body: + body = self.sanitize_for_serialization(body) + + # request url + if _host is None or self.configuration.ignore_operation_servers: + url = self.configuration.host + resource_path + else: + # use server/host defined in path or operation instead + url = _host + resource_path + + # query parameters + if query_params: + query_params = self.sanitize_for_serialization(query_params) + url_query = self.parameters_to_url_query(query_params, collection_formats) + url += "?" 
+ url_query + + return method, url, header_params, body, post_params + + def call_api( + self, method, url, header_params=None, body=None, post_params=None, _request_timeout=None + ) -> rest.RESTResponse: + """Makes the HTTP request (synchronous) + :param method: Method to call. + :param url: Path to method endpoint. + :param header_params: Header parameters to be + placed in the request header. + :param body: Request body. + :param post_params dict: Request post form parameters, + for `application/x-www-form-urlencoded`, `multipart/form-data`. + :param _request_timeout: timeout setting for this request. + :return: RESTResponse + """ + + try: + # perform request and return response + response_data = self.rest_client.request( + method, + url, + headers=header_params, + body=body, + post_params=post_params, + _request_timeout=_request_timeout, + ) + + except ApiException as e: + raise e + + return response_data + + def response_deserialize( + self, response_data: rest.RESTResponse, response_types_map: Optional[Dict[str, ApiResponseT]] = None + ) -> ApiResponse[ApiResponseT]: + """Deserializes response into an object. + :param response_data: RESTResponse object to be deserialized. + :param response_types_map: dict of response types. + :return: ApiResponse + """ + + msg = "RESTResponse.read() must be called before passing it to response_deserialize()" + assert response_data.data is not None, msg + + response_type = response_types_map.get(str(response_data.status), None) + if not response_type and isinstance(response_data.status, int) and 100 <= response_data.status <= 599: + # if not found, look for '1XX', '2XX', etc. 
+ response_type = response_types_map.get(str(response_data.status)[0] + "XX", None) + + # deserialize response data + response_text = None + return_data = None + try: + if response_type == "bytearray": + return_data = response_data.data + elif response_type == "file": + return_data = self.__deserialize_file(response_data) + elif response_type is not None: + match = None + content_type = response_data.getheader("content-type") + if content_type is not None: + match = re.search(r"charset=([a-zA-Z\-\d]+)[\s;]?", content_type) + encoding = match.group(1) if match else "utf-8" + response_text = response_data.data.decode(encoding) + return_data = self.deserialize(response_text, response_type, content_type) + finally: + if not 200 <= response_data.status <= 299: + raise ApiException.from_response( + http_resp=response_data, + body=response_text, + data=return_data, + ) + + return ApiResponse( + status_code=response_data.status, + data=return_data, + headers=response_data.getheaders(), + raw_data=response_data.data, + ) + + def sanitize_for_serialization(self, obj): + """Builds a JSON POST object. + + If obj is None, return None. + If obj is SecretStr, return obj.get_secret_value() + If obj is str, int, long, float, bool, return directly. + If obj is datetime.datetime, datetime.date + convert to string in iso8601 format. + If obj is decimal.Decimal return string representation. + If obj is list, sanitize each element in the list. + If obj is dict, return the dict. + If obj is OpenAPI model, return the properties dict. + + :param obj: The data to serialize. + :return: The serialized form of data. 
+ """ + if obj is None: + return None + elif isinstance(obj, Enum): + return obj.value + elif isinstance(obj, SecretStr): + return obj.get_secret_value() + elif isinstance(obj, self.PRIMITIVE_TYPES): + return obj + elif isinstance(obj, list): + return [self.sanitize_for_serialization(sub_obj) for sub_obj in obj] + elif isinstance(obj, tuple): + return tuple(self.sanitize_for_serialization(sub_obj) for sub_obj in obj) + elif isinstance(obj, (datetime.datetime, datetime.date)): + return obj.isoformat() + elif isinstance(obj, decimal.Decimal): + return str(obj) + + elif isinstance(obj, dict): + obj_dict = obj + else: + # Convert model obj to dict except + # attributes `openapi_types`, `attribute_map` + # and attributes which value is not None. + # Convert attribute name to json key in + # model definition for request. + if hasattr(obj, "to_dict") and callable(getattr(obj, "to_dict")): + obj_dict = obj.to_dict() + else: + obj_dict = obj.__dict__ + + return {key: self.sanitize_for_serialization(val) for key, val in obj_dict.items()} + + def deserialize(self, response_text: str, response_type: str, content_type: Optional[str]): + """Deserializes response into an object. + + :param response: RESTResponse object to be deserialized. + :param response_type: class literal for + deserialized object, or string of class name. + :param content_type: content type of response. + + :return: deserialized object. 
+ """ + + # fetch data from response object + if content_type is None: + try: + data = json.loads(response_text) + except ValueError: + data = response_text + elif re.match(r"^application/(json|[\w!#$&.+-^_]+\+json)\s*(;|$)", content_type, re.IGNORECASE): + if response_text == "": + data = "" + else: + data = json.loads(response_text) + elif re.match(r"^text\/[a-z.+-]+\s*(;|$)", content_type, re.IGNORECASE): + data = response_text + else: + raise ApiException(status=0, reason="Unsupported content type: {0}".format(content_type)) + + return self.__deserialize(data, response_type) + + def __deserialize(self, data, klass): + """Deserializes dict, list, str into an object. + + :param data: dict, list or str. + :param klass: class literal, or string of class name. + + :return: object. + """ + if data is None: + return None + + if isinstance(klass, str): + if klass.startswith("List["): + m = re.match(r"List\[(.*)]", klass) + assert m is not None, "Malformed List type definition" + sub_kls = m.group(1) + return [self.__deserialize(sub_data, sub_kls) for sub_data in data] + + if klass.startswith("Dict["): + m = re.match(r"Dict\[([^,]*), (.*)]", klass) + assert m is not None, "Malformed Dict type definition" + sub_kls = m.group(2) + return {k: self.__deserialize(v, sub_kls) for k, v in data.items()} + + # convert str to class + if klass in self.NATIVE_TYPES_MAPPING: + klass = self.NATIVE_TYPES_MAPPING[klass] + else: + klass = getattr(admin_api_lib.extractor_api_client.openapi_client.models, klass) + + if klass in self.PRIMITIVE_TYPES: + return self.__deserialize_primitive(data, klass) + elif klass == object: + return self.__deserialize_object(data) + elif klass == datetime.date: + return self.__deserialize_date(data) + elif klass == datetime.datetime: + return self.__deserialize_datetime(data) + elif klass == decimal.Decimal: + return decimal.Decimal(data) + elif issubclass(klass, Enum): + return self.__deserialize_enum(data, klass) + else: + return 
self.__deserialize_model(data, klass) + + def parameters_to_tuples(self, params, collection_formats): + """Get parameters as list of tuples, formatting collections. + + :param params: Parameters as dict or list of two-tuples + :param dict collection_formats: Parameter collection formats + :return: Parameters as list of tuples, collections formatted + """ + new_params: List[Tuple[str, str]] = [] + if collection_formats is None: + collection_formats = {} + for k, v in params.items() if isinstance(params, dict) else params: + if k in collection_formats: + collection_format = collection_formats[k] + if collection_format == "multi": + new_params.extend((k, value) for value in v) + else: + if collection_format == "ssv": + delimiter = " " + elif collection_format == "tsv": + delimiter = "\t" + elif collection_format == "pipes": + delimiter = "|" + else: # csv is the default + delimiter = "," + new_params.append((k, delimiter.join(str(value) for value in v))) + else: + new_params.append((k, v)) + return new_params + + def parameters_to_url_query(self, params, collection_formats): + """Get parameters as list of tuples, formatting collections. + + :param params: Parameters as dict or list of two-tuples + :param dict collection_formats: Parameter collection formats + :return: URL query string (e.g. 
a=Hello%20World&b=123) + """ + new_params: List[Tuple[str, str]] = [] + if collection_formats is None: + collection_formats = {} + for k, v in params.items() if isinstance(params, dict) else params: + if isinstance(v, bool): + v = str(v).lower() + if isinstance(v, (int, float)): + v = str(v) + if isinstance(v, dict): + v = json.dumps(v) + + if k in collection_formats: + collection_format = collection_formats[k] + if collection_format == "multi": + new_params.extend((k, str(value)) for value in v) + else: + if collection_format == "ssv": + delimiter = " " + elif collection_format == "tsv": + delimiter = "\t" + elif collection_format == "pipes": + delimiter = "|" + else: # csv is the default + delimiter = "," + new_params.append((k, delimiter.join(quote(str(value)) for value in v))) + else: + new_params.append((k, quote(str(v)))) + + return "&".join(["=".join(map(str, item)) for item in new_params]) + + def files_parameters( + self, + files: Dict[str, Union[str, bytes, List[str], List[bytes], Tuple[str, bytes]]], + ): + """Builds form parameters. + + :param files: File parameters. + :return: Form parameters with files. + """ + params = [] + for k, v in files.items(): + if isinstance(v, str): + with open(v, "rb") as f: + filename = os.path.basename(f.name) + filedata = f.read() + elif isinstance(v, bytes): + filename = k + filedata = v + elif isinstance(v, tuple): + filename, filedata = v + elif isinstance(v, list): + for file_param in v: + params.extend(self.files_parameters({k: file_param})) + continue + else: + raise ValueError("Unsupported file value") + mimetype = mimetypes.guess_type(filename)[0] or "application/octet-stream" + params.append(tuple([k, tuple([filename, filedata, mimetype])])) + return params + + def select_header_accept(self, accepts: List[str]) -> Optional[str]: + """Returns `Accept` based on an array of accepts provided. + + :param accepts: List of headers. + :return: Accept (e.g. application/json). 
+ """ + if not accepts: + return None + + for accept in accepts: + if re.search("json", accept, re.IGNORECASE): + return accept + + return accepts[0] + + def select_header_content_type(self, content_types): + """Returns `Content-Type` based on an array of content_types provided. + + :param content_types: List of content-types. + :return: Content-Type (e.g. application/json). + """ + if not content_types: + return None + + for content_type in content_types: + if re.search("json", content_type, re.IGNORECASE): + return content_type + + return content_types[0] + + def update_params_for_auth( + self, headers, queries, auth_settings, resource_path, method, body, request_auth=None + ) -> None: + """Updates header and query params based on authentication setting. + + :param headers: Header parameters dict to be updated. + :param queries: Query parameters tuple list to be updated. + :param auth_settings: Authentication setting identifiers list. + :resource_path: A string representation of the HTTP request resource path. + :method: A string representation of the HTTP request method. + :body: A object representing the body of the HTTP request. + The object type is the return value of sanitize_for_serialization(). + :param request_auth: if set, the provided settings will + override the token in the configuration. + """ + if not auth_settings: + return + + if request_auth: + self._apply_auth_params(headers, queries, resource_path, method, body, request_auth) + else: + for auth in auth_settings: + auth_setting = self.configuration.auth_settings().get(auth) + if auth_setting: + self._apply_auth_params(headers, queries, resource_path, method, body, auth_setting) + + def _apply_auth_params(self, headers, queries, resource_path, method, body, auth_setting) -> None: + """Updates the request parameters based on a single auth_setting + + :param headers: Header parameters dict to be updated. + :param queries: Query parameters tuple list to be updated. 
+ :resource_path: A string representation of the HTTP request resource path. + :method: A string representation of the HTTP request method. + :body: A object representing the body of the HTTP request. + The object type is the return value of sanitize_for_serialization(). + :param auth_setting: auth settings for the endpoint + """ + if auth_setting["in"] == "cookie": + headers["Cookie"] = auth_setting["value"] + elif auth_setting["in"] == "header": + if auth_setting["type"] != "http-signature": + headers[auth_setting["key"]] = auth_setting["value"] + elif auth_setting["in"] == "query": + queries.append((auth_setting["key"], auth_setting["value"])) + else: + raise ApiValueError("Authentication token must be in `query` or `header`") + + def __deserialize_file(self, response): + """Deserializes body to file + + Saves response body into a file in a temporary folder, + using the filename from the `Content-Disposition` header if provided. + + handle file downloading + save response body into a tmp file and return the instance + + :param response: RESTResponse. + :return: file path. + """ + fd, path = tempfile.mkstemp(dir=self.configuration.temp_folder_path) + os.close(fd) + os.remove(path) + + content_disposition = response.getheader("Content-Disposition") + if content_disposition: + m = re.search(r'filename=[\'"]?([^\'"\s]+)[\'"]?', content_disposition) + assert m is not None, "Unexpected 'content-disposition' header value" + filename = m.group(1) + path = os.path.join(os.path.dirname(path), filename) + + with open(path, "wb") as f: + f.write(response.data) + + return path + + def __deserialize_primitive(self, data, klass): + """Deserializes string to primitive type. + + :param data: str. + :param klass: class literal. + + :return: int, long, float, str, bool. + """ + try: + return klass(data) + except UnicodeEncodeError: + return str(data) + except TypeError: + return data + + def __deserialize_object(self, value): + """Return an original value. + + :return: object. 
+ """ + return value + + def __deserialize_date(self, string): + """Deserializes string to date. + + :param string: str. + :return: date. + """ + try: + return parse(string).date() + except ImportError: + return string + except ValueError: + raise rest.ApiException(status=0, reason="Failed to parse `{0}` as date object".format(string)) + + def __deserialize_datetime(self, string): + """Deserializes string to datetime. + + The string should be in iso8601 datetime format. + + :param string: str. + :return: datetime. + """ + try: + return parse(string) + except ImportError: + return string + except ValueError: + raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as datetime object".format(string))) + + def __deserialize_enum(self, data, klass): + """Deserializes primitive type to enum. + + :param data: primitive type. + :param klass: class literal. + :return: enum value. + """ + try: + return klass(data) + except ValueError: + raise rest.ApiException(status=0, reason=("Failed to parse `{0}` as `{1}`".format(data, klass))) + + def __deserialize_model(self, data, klass): + """Deserializes list or dict to model. + + :param data: dict, list. + :param klass: class literal. + :return: model object. 
+ """ + + return klass.from_dict(data) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py new file mode 100644 index 0000000..1ce1372 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py @@ -0,0 +1,20 @@ +"""API response object.""" + +from __future__ import annotations +from typing import Optional, Generic, Mapping, TypeVar +from pydantic import Field, StrictInt, StrictBytes, BaseModel + +T = TypeVar("T") + + +class ApiResponse(BaseModel, Generic[T]): + """ + API response object + """ + + status_code: StrictInt = Field(description="HTTP status code") + headers: Optional[Mapping[str, str]] = Field(None, description="HTTP headers") + data: T = Field(description="Deserialized data given the data type") + raw_data: StrictBytes = Field(description="Raw data (HTTP response body)") + + model_config = {"arbitrary_types_allowed": True} diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py new file mode 100644 index 0000000..2e80369 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py @@ -0,0 +1,460 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import copy +import logging +from logging import FileHandler +import multiprocessing +import sys +from typing import Optional +import urllib3 + +import http.client as httplib + +JSON_SCHEMA_VALIDATION_KEYWORDS = { + "multipleOf", + "maximum", + "exclusiveMaximum", + "minimum", + "exclusiveMinimum", + "maxLength", + "minLength", + "pattern", + "maxItems", + "minItems", +} + + +class Configuration: + """This class contains various settings of the API client. + + :param host: Base url. + :param ignore_operation_servers + Boolean to ignore operation servers for the API client. + Config will use `host` as the base url regardless of the operation servers. + :param api_key: Dict to store API key(s). + Each entry in the dict specifies an API key. + The dict key is the name of the security scheme in the OAS specification. + The dict value is the API key secret. + :param api_key_prefix: Dict to store API prefix (e.g. Bearer). + The dict key is the name of the security scheme in the OAS specification. + The dict value is an API key prefix when generating the auth data. + :param username: Username for HTTP basic authentication. + :param password: Password for HTTP basic authentication. + :param access_token: Access token. + :param server_index: Index to servers configuration. + :param server_variables: Mapping with string values to replace variables in + templated server configuration. The validation of enums is performed for + variables with defined enum values before. + :param server_operation_index: Mapping from operation ID to an index to server + configuration. + :param server_operation_variables: Mapping from operation ID to a mapping with + string values to replace variables in templated server configuration. + The validation of enums is performed for variables with defined enum + values before. + :param ssl_ca_cert: str - the path to a file of concatenated CA certificates + in PEM format. + :param retries: Number of retries for API requests. 
+ + """ + + _default = None + + def __init__( + self, + host=None, + api_key=None, + api_key_prefix=None, + username=None, + password=None, + access_token=None, + server_index=None, + server_variables=None, + server_operation_index=None, + server_operation_variables=None, + ignore_operation_servers=False, + ssl_ca_cert=None, + retries=None, + *, + debug: Optional[bool] = None + ) -> None: + """Constructor""" + self._base_path = "http://localhost" if host is None else host + """Default Base url + """ + self.server_index = 0 if server_index is None and host is None else server_index + self.server_operation_index = server_operation_index or {} + """Default server index + """ + self.server_variables = server_variables or {} + self.server_operation_variables = server_operation_variables or {} + """Default server variables + """ + self.ignore_operation_servers = ignore_operation_servers + """Ignore operation servers + """ + self.temp_folder_path = None + """Temp file folder for downloading files + """ + # Authentication Settings + self.api_key = {} + if api_key: + self.api_key = api_key + """dict to store API key(s) + """ + self.api_key_prefix = {} + if api_key_prefix: + self.api_key_prefix = api_key_prefix + """dict to store API prefix (e.g. 
Bearer) + """ + self.refresh_api_key_hook = None + """function hook to refresh API key if expired + """ + self.username = username + """Username for HTTP basic authentication + """ + self.password = password + """Password for HTTP basic authentication + """ + self.access_token = access_token + """Access token + """ + self.logger = {} + """Logging Settings + """ + self.logger["package_logger"] = logging.getLogger("admin_api_lib.extractor_api_client.openapi_client") + self.logger["urllib3_logger"] = logging.getLogger("urllib3") + self.logger_format = "%(asctime)s %(levelname)s %(message)s" + """Log format + """ + self.logger_stream_handler = None + """Log stream handler + """ + self.logger_file_handler: Optional[FileHandler] = None + """Log file handler + """ + self.logger_file = None + """Debug file location + """ + if debug is not None: + self.debug = debug + else: + self.__debug = False + """Debug switch + """ + + self.verify_ssl = True + """SSL/TLS verification + Set this to false to skip verifying SSL certificate when calling API + from https server. + """ + self.ssl_ca_cert = ssl_ca_cert + """Set this to customize the certificate file to verify the peer. + """ + self.cert_file = None + """client certificate file + """ + self.key_file = None + """client key file + """ + self.assert_hostname = None + """Set this to True/False to enable/disable SSL hostname verification. + """ + self.tls_server_name = None + """SSL/TLS Server Name Indication (SNI) + Set this to the SNI value expected by the server. + """ + + self.connection_pool_maxsize = multiprocessing.cpu_count() * 5 + """urllib3 connection pool's maximum number of connections saved + per pool. urllib3 uses 1 connection as default value, but this is + not the best value when you are making a lot of possibly parallel + requests to the same host, which is often the case here. + cpu_count * 5 is used as default value to increase performance. 
+ """ + + self.proxy: Optional[str] = None + """Proxy URL + """ + self.proxy_headers = None + """Proxy headers + """ + self.safe_chars_for_path_param = "" + """Safe chars for path_param + """ + self.retries = retries + """Adding retries to override urllib3 default value 3 + """ + # Enable client side validation + self.client_side_validation = True + + self.socket_options = None + """Options to pass down to the underlying urllib3 socket + """ + + self.datetime_format = "%Y-%m-%dT%H:%M:%S.%f%z" + """datetime format + """ + + self.date_format = "%Y-%m-%d" + """date format + """ + + def __deepcopy__(self, memo): + cls = self.__class__ + result = cls.__new__(cls) + memo[id(self)] = result + for k, v in self.__dict__.items(): + if k not in ("logger", "logger_file_handler"): + setattr(result, k, copy.deepcopy(v, memo)) + # shallow copy of loggers + result.logger = copy.copy(self.logger) + # use setters to configure loggers + result.logger_file = self.logger_file + result.debug = self.debug + return result + + def __setattr__(self, name, value): + object.__setattr__(self, name, value) + + @classmethod + def set_default(cls, default): + """Set default instance of configuration. + + It stores default configuration, which can be + returned by get_default_copy method. + + :param default: object of Configuration + """ + cls._default = default + + @classmethod + def get_default_copy(cls): + """Deprecated. Please use `get_default` instead. + + Deprecated. Please use `get_default` instead. + + :return: The configuration object. + """ + return cls.get_default() + + @classmethod + def get_default(cls): + """Return the default configuration. + + This method returns newly created, based on default constructor, + object of Configuration class or returns a copy of default + configuration. + + :return: The configuration object. + """ + if cls._default is None: + cls._default = Configuration() + return cls._default + + @property + def logger_file(self): + """The logger file. 
+ + If the logger_file is None, then add stream handler and remove file + handler. Otherwise, add file handler and remove stream handler. + + :param value: The logger_file path. + :type: str + """ + return self.__logger_file + + @logger_file.setter + def logger_file(self, value): + """The logger file. + + If the logger_file is None, then add stream handler and remove file + handler. Otherwise, add file handler and remove stream handler. + + :param value: The logger_file path. + :type: str + """ + self.__logger_file = value + if self.__logger_file: + # If set logging file, + # then add file handler and remove stream handler. + self.logger_file_handler = logging.FileHandler(self.__logger_file) + self.logger_file_handler.setFormatter(self.logger_formatter) + for _, logger in self.logger.items(): + logger.addHandler(self.logger_file_handler) + + @property + def debug(self): + """Debug status + + :param value: The debug status, True or False. + :type: bool + """ + return self.__debug + + @debug.setter + def debug(self, value): + """Debug status + + :param value: The debug status, True or False. + :type: bool + """ + self.__debug = value + if self.__debug: + # if debug status is True, turn on debug logging + for _, logger in self.logger.items(): + logger.setLevel(logging.DEBUG) + # turn on httplib debug + httplib.HTTPConnection.debuglevel = 1 + else: + # if debug status is False, turn off debug logging, + # setting log level to default `logging.WARNING` + for _, logger in self.logger.items(): + logger.setLevel(logging.WARNING) + # turn off httplib debug + httplib.HTTPConnection.debuglevel = 0 + + @property + def logger_format(self): + """The logger format. + + The logger_formatter will be updated when sets logger_format. + + :param value: The format string. + :type: str + """ + return self.__logger_format + + @logger_format.setter + def logger_format(self, value): + """The logger format. + + The logger_formatter will be updated when sets logger_format. 
+ + :param value: The format string. + :type: str + """ + self.__logger_format = value + self.logger_formatter = logging.Formatter(self.__logger_format) + + def get_api_key_with_prefix(self, identifier, alias=None): + """Gets API key (with prefix if set). + + :param identifier: The identifier of apiKey. + :param alias: The alternative identifier of apiKey. + :return: The token for api key authentication. + """ + if self.refresh_api_key_hook is not None: + self.refresh_api_key_hook(self) + key = self.api_key.get(identifier, self.api_key.get(alias) if alias is not None else None) + if key: + prefix = self.api_key_prefix.get(identifier) + if prefix: + return "%s %s" % (prefix, key) + else: + return key + + def get_basic_auth_token(self): + """Gets HTTP basic authentication header (string). + + :return: The token for basic HTTP authentication. + """ + username = "" + if self.username is not None: + username = self.username + password = "" + if self.password is not None: + password = self.password + return urllib3.util.make_headers(basic_auth=username + ":" + password).get("authorization") + + def auth_settings(self): + """Gets Auth Settings dict for api client. + + :return: The Auth Settings information dict. + """ + auth = {} + return auth + + def to_debug_report(self): + """Gets the essential information for debugging. + + :return: The report for debugging. 
+ """ + return ( + "Python SDK Debug Report:\n" + "OS: {env}\n" + "Python Version: {pyversion}\n" + "Version of the API: 1.0.0\n" + "SDK Package Version: 1.0.0".format(env=sys.platform, pyversion=sys.version) + ) + + def get_host_settings(self): + """Gets an array of host settings + + :return: An array of host settings + """ + return [ + { + "url": "", + "description": "No description provided", + } + ] + + def get_host_from_settings(self, index, variables=None, servers=None): + """Gets host URL based on the index and variables + :param index: array index of the host settings + :param variables: hash of variable and the corresponding value + :param servers: an array of host settings or None + :return: URL based on host settings + """ + if index is None: + return self._base_path + + variables = {} if variables is None else variables + servers = self.get_host_settings() if servers is None else servers + + try: + server = servers[index] + except IndexError: + raise ValueError( + "Invalid index {0} when selecting the host settings. " + "Must be less than {1}".format(index, len(servers)) + ) + + url = server["url"] + + # go through variables and replace placeholders + for variable_name, variable in server.get("variables", {}).items(): + used_value = variables.get(variable_name, variable["default_value"]) + + if "enum_values" in variable and used_value not in variable["enum_values"]: + raise ValueError( + "The variable `{0}` in the host URL has invalid value " + "{1}. 
Must be {2}.".format(variable_name, variables[variable_name], variable["enum_values"]) + ) + + url = url.replace("{" + variable_name + "}", used_value) + + return url + + @property + def host(self): + """Return generated host.""" + return self.get_host_from_settings(self.server_index, variables=self.server_variables) + + @host.setter + def host(self, value): + """Fix base path.""" + self._base_path = value + self.server_index = None diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py new file mode 100644 index 0000000..5dbd4b0 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py @@ -0,0 +1,197 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + +from typing import Any, Optional +from typing_extensions import Self + + +class OpenApiException(Exception): + """The base exception class for all OpenAPIExceptions""" + + +class ApiTypeError(OpenApiException, TypeError): + def __init__(self, msg, path_to_item=None, valid_classes=None, key_type=None) -> None: + """Raises an exception for TypeErrors + + Args: + msg (str): the exception message + + Keyword Args: + path_to_item (list): a list of keys an indices to get to the + current_item + None if unset + valid_classes (tuple): the primitive classes that current item + should be an instance of + None if unset + key_type (bool): False if our value is a value in a dict + True if it is a key in a dict + False if our item is an item in a list + None if unset + """ + self.path_to_item = path_to_item + self.valid_classes = valid_classes + self.key_type = key_type + full_msg = msg + if path_to_item: + full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) + super(ApiTypeError, self).__init__(full_msg) + + +class ApiValueError(OpenApiException, ValueError): + def __init__(self, msg, path_to_item=None) -> None: + """ + Args: + msg (str): the exception message + + Keyword Args: + path_to_item (list) the path to the exception in the + received_data dict. None if unset + """ + + self.path_to_item = path_to_item + full_msg = msg + if path_to_item: + full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) + super(ApiValueError, self).__init__(full_msg) + + +class ApiAttributeError(OpenApiException, AttributeError): + def __init__(self, msg, path_to_item=None) -> None: + """ + Raised when an attribute reference or assignment fails. 
+ + Args: + msg (str): the exception message + + Keyword Args: + path_to_item (None/list) the path to the exception in the + received_data dict + """ + self.path_to_item = path_to_item + full_msg = msg + if path_to_item: + full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) + super(ApiAttributeError, self).__init__(full_msg) + + +class ApiKeyError(OpenApiException, KeyError): + def __init__(self, msg, path_to_item=None) -> None: + """ + Args: + msg (str): the exception message + + Keyword Args: + path_to_item (None/list) the path to the exception in the + received_data dict + """ + self.path_to_item = path_to_item + full_msg = msg + if path_to_item: + full_msg = "{0} at {1}".format(msg, render_path(path_to_item)) + super(ApiKeyError, self).__init__(full_msg) + + +class ApiException(OpenApiException): + + def __init__( + self, + status=None, + reason=None, + http_resp=None, + *, + body: Optional[str] = None, + data: Optional[Any] = None, + ) -> None: + self.status = status + self.reason = reason + self.body = body + self.data = data + self.headers = None + + if http_resp: + if self.status is None: + self.status = http_resp.status + if self.reason is None: + self.reason = http_resp.reason + if self.body is None: + try: + self.body = http_resp.data.decode("utf-8") + except Exception: + pass + self.headers = http_resp.getheaders() + + @classmethod + def from_response( + cls, + *, + http_resp, + body: Optional[str], + data: Optional[Any], + ) -> Self: + if http_resp.status == 400: + raise BadRequestException(http_resp=http_resp, body=body, data=data) + + if http_resp.status == 401: + raise UnauthorizedException(http_resp=http_resp, body=body, data=data) + + if http_resp.status == 403: + raise ForbiddenException(http_resp=http_resp, body=body, data=data) + + if http_resp.status == 404: + raise NotFoundException(http_resp=http_resp, body=body, data=data) + + if 500 <= http_resp.status <= 599: + raise ServiceException(http_resp=http_resp, body=body, data=data) + 
raise ApiException(http_resp=http_resp, body=body, data=data) + + def __str__(self): + """Custom error messages for exception""" + error_message = "({0})\n" "Reason: {1}\n".format(self.status, self.reason) + if self.headers: + error_message += "HTTP response headers: {0}\n".format(self.headers) + + if self.data or self.body: + error_message += "HTTP response body: {0}\n".format(self.data or self.body) + + return error_message + + +class BadRequestException(ApiException): + pass + + +class NotFoundException(ApiException): + pass + + +class UnauthorizedException(ApiException): + pass + + +class ForbiddenException(ApiException): + pass + + +class ServiceException(ApiException): + pass + + +def render_path(path_to_item): + """Returns a string representation of a path""" + result = "" + for pth in path_to_item: + if isinstance(pth, int): + result += "[{0}]".format(pth) + else: + result += "['{0}']".format(pth) + return result diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py new file mode 100644 index 0000000..ad02f00 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py @@ -0,0 +1,21 @@ +# coding: utf-8 + +# flake8: noqa +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +# import models into model package +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/models/content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py similarity index 100% rename from admin-api-lib/src/admin_api_lib/extractor_api_client/models/content_type.py rename to admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py new file mode 100644 index 0000000..da4408d --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py @@ -0,0 +1,103 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + +from pydantic import BaseModel, ConfigDict, Field, StrictStr +from typing import Any, ClassVar, Dict, List, Optional +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair +from typing import Optional, Set +from typing_extensions import Self + + +class ExtractionParameters(BaseModel): + """ """ # noqa: E501 + + document_name: StrictStr = Field( + description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." + ) + confluence_kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") + type: StrictStr = Field(description="Extractortype") + __properties: ClassVar[List[str]] = ["document_name", "confluence_kwargs", "type"] + + model_config = ConfigDict( + populate_by_name=True, + validate_assignment=True, + protected_namespaces=(), + ) + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Optional[Self]: + """Create an instance of ExtractionParameters from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. 
+ """ + excluded_fields: Set[str] = set([]) + + _dict = self.model_dump( + by_alias=True, + exclude=excluded_fields, + exclude_none=True, + ) + # override the default output from pydantic by calling `to_dict()` of each item in confluence_kwargs (list) + _items = [] + if self.confluence_kwargs: + for _item_confluence_kwargs in self.confluence_kwargs: + if _item_confluence_kwargs: + _items.append(_item_confluence_kwargs.to_dict()) + _dict["confluence_kwargs"] = _items + return _dict + + @classmethod + def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: + """Create an instance of ExtractionParameters from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate( + { + "document_name": obj.get("document_name"), + "confluence_kwargs": ( + [KeyValuePair.from_dict(_item) for _item in obj["confluence_kwargs"]] + if obj.get("confluence_kwargs") is not None + else None + ), + "type": obj.get("type"), + } + ) + return _obj diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py new file mode 100644 index 0000000..8bcfb3c --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py @@ -0,0 +1,82 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List +from typing import Optional, Set +from typing_extensions import Self + + +class ExtractionRequest(BaseModel): + """ """ # noqa: E501 + + path_on_s3: StrictStr + document_name: StrictStr + __properties: ClassVar[List[str]] = ["path_on_s3", "document_name"] + + model_config = ConfigDict( + populate_by_name=True, + validate_assignment=True, + protected_namespaces=(), + ) + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Optional[Self]: + """Create an instance of ExtractionRequest from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. 
+ """ + excluded_fields: Set[str] = set([]) + + _dict = self.model_dump( + by_alias=True, + exclude=excluded_fields, + exclude_none=True, + ) + return _dict + + @classmethod + def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: + """Create an instance of ExtractionRequest from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate({"path_on_s3": obj.get("path_on_s3"), "document_name": obj.get("document_name")}) + return _obj diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/models/information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py similarity index 94% rename from admin-api-lib/src/admin_api_lib/extractor_api_client/models/information_piece.py rename to admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py index 99c3ee2..a428183 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/models/information_piece.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py @@ -19,8 +19,8 @@ from pydantic import BaseModel, ConfigDict, StrictStr from typing import Any, ClassVar, Dict, List -from admin_api_lib.extractor_api_client.models.content_type import ContentType -from admin_api_lib.extractor_api_client.models.key_value_pair import KeyValuePair +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair from typing import Optional, Set from typing_extensions import Self diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py similarity index 100% rename from admin-api-lib/src/admin_api_lib/extractor_api_client/models/key_value_pair.py 
rename to admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py new file mode 100644 index 0000000..60fc660 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/rest.py @@ -0,0 +1,209 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. +""" # noqa: E501 + + +import io +import json +import re +import ssl + +import urllib3 + +from admin_api_lib.extractor_api_client.openapi_client.exceptions import ApiException, ApiValueError + +SUPPORTED_SOCKS_PROXIES = {"socks5", "socks5h", "socks4", "socks4a"} +RESTResponseType = urllib3.HTTPResponse + + +def is_socks_proxy_url(url): + if url is None: + return False + split_section = url.split("://") + if len(split_section) < 2: + return False + else: + return split_section[0].lower() in SUPPORTED_SOCKS_PROXIES + + +class RESTResponse(io.IOBase): + + def __init__(self, resp) -> None: + self.response = resp + self.status = resp.status + self.reason = resp.reason + self.data = None + + def read(self): + if self.data is None: + self.data = self.response.data + return self.data + + def getheaders(self): + """Returns a dictionary of the response headers.""" + return self.response.headers + + def getheader(self, name, default=None): + """Returns a given response header.""" + return self.response.headers.get(name, default) + + +class RESTClientObject: + + def __init__(self, configuration) -> None: + # urllib3.PoolManager will pass all kw parameters to connectionpool + # 
https://github.com/shazow/urllib3/blob/f9409436f83aeb79fbaf090181cd81b784f1b8ce/urllib3/poolmanager.py#L75 # noqa: E501 + # https://github.com/shazow/urllib3/blob/f9409436f83aeb79fbaf090181cd81b784f1b8ce/urllib3/connectionpool.py#L680 # noqa: E501 + # Custom SSL certificates and client certificates: http://urllib3.readthedocs.io/en/latest/advanced-usage.html # noqa: E501 + + # cert_reqs + if configuration.verify_ssl: + cert_reqs = ssl.CERT_REQUIRED + else: + cert_reqs = ssl.CERT_NONE + + pool_args = { + "cert_reqs": cert_reqs, + "ca_certs": configuration.ssl_ca_cert, + "cert_file": configuration.cert_file, + "key_file": configuration.key_file, + } + if configuration.assert_hostname is not None: + pool_args["assert_hostname"] = configuration.assert_hostname + + if configuration.retries is not None: + pool_args["retries"] = configuration.retries + + if configuration.tls_server_name: + pool_args["server_hostname"] = configuration.tls_server_name + + if configuration.socket_options is not None: + pool_args["socket_options"] = configuration.socket_options + + if configuration.connection_pool_maxsize is not None: + pool_args["maxsize"] = configuration.connection_pool_maxsize + + # https pool manager + self.pool_manager: urllib3.PoolManager + + if configuration.proxy: + if is_socks_proxy_url(configuration.proxy): + from urllib3.contrib.socks import SOCKSProxyManager + + pool_args["proxy_url"] = configuration.proxy + pool_args["headers"] = configuration.proxy_headers + self.pool_manager = SOCKSProxyManager(**pool_args) + else: + pool_args["proxy_url"] = configuration.proxy + pool_args["proxy_headers"] = configuration.proxy_headers + self.pool_manager = urllib3.ProxyManager(**pool_args) + else: + self.pool_manager = urllib3.PoolManager(**pool_args) + + def request(self, method, url, headers=None, body=None, post_params=None, _request_timeout=None): + """Perform requests. 
+ + :param method: http request method + :param url: http request url + :param headers: http request headers + :param body: request json body, for `application/json` + :param post_params: request post parameters, + `application/x-www-form-urlencoded` + and `multipart/form-data` + :param _request_timeout: timeout setting for this request. If one + number provided, it will be total request + timeout. It can also be a pair (tuple) of + (connection, read) timeouts. + """ + method = method.upper() + assert method in ["GET", "HEAD", "DELETE", "POST", "PUT", "PATCH", "OPTIONS"] + + if post_params and body: + raise ApiValueError("body parameter cannot be used with post_params parameter.") + + post_params = post_params or {} + headers = headers or {} + + timeout = None + if _request_timeout: + if isinstance(_request_timeout, (int, float)): + timeout = urllib3.Timeout(total=_request_timeout) + elif isinstance(_request_timeout, tuple) and len(_request_timeout) == 2: + timeout = urllib3.Timeout(connect=_request_timeout[0], read=_request_timeout[1]) + + try: + # For `POST`, `PUT`, `PATCH`, `OPTIONS`, `DELETE` + if method in ["POST", "PUT", "PATCH", "OPTIONS", "DELETE"]: + + # no content type provided or payload is json + content_type = headers.get("Content-Type") + if not content_type or re.search("json", content_type, re.IGNORECASE): + request_body = None + if body is not None: + request_body = json.dumps(body) + r = self.pool_manager.request( + method, url, body=request_body, timeout=timeout, headers=headers, preload_content=False + ) + elif content_type == "application/x-www-form-urlencoded": + r = self.pool_manager.request( + method, + url, + fields=post_params, + encode_multipart=False, + timeout=timeout, + headers=headers, + preload_content=False, + ) + elif content_type == "multipart/form-data": + # must del headers['Content-Type'], or the correct + # Content-Type which generated by urllib3 will be + # overwritten. 
+ del headers["Content-Type"] + # Ensures that dict objects are serialized + post_params = [(a, json.dumps(b)) if isinstance(b, dict) else (a, b) for a, b in post_params] + r = self.pool_manager.request( + method, + url, + fields=post_params, + encode_multipart=True, + timeout=timeout, + headers=headers, + preload_content=False, + ) + # Pass a `string` parameter directly in the body to support + # other content types than JSON when `body` argument is + # provided in serialized form. + elif isinstance(body, str) or isinstance(body, bytes): + r = self.pool_manager.request( + method, url, body=body, timeout=timeout, headers=headers, preload_content=False + ) + elif headers["Content-Type"].startswith("text/") and isinstance(body, bool): + request_body = "true" if body else "false" + r = self.pool_manager.request( + method, url, body=request_body, preload_content=False, timeout=timeout, headers=headers + ) + else: + # Cannot generate the request from given parameters + msg = """Cannot prepare a request message for provided + arguments. 
Please check that your arguments match + declared content type.""" + raise ApiException(status=0, reason=msg) + # For `GET`, `HEAD` + else: + r = self.pool_manager.request( + method, url, fields={}, timeout=timeout, headers=headers, preload_content=False + ) + except urllib3.exceptions.SSLError as e: + msg = "\n".join([type(e).__name__, str(e)]) + raise ApiException(status=0, reason=msg) + + return RESTResponse(r) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/__init__.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py new file mode 100644 index 0000000..5a78d9b --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py @@ -0,0 +1,35 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ContentType + + +class TestContentType(unittest.TestCase): + """ContentType unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def testContentType(self): + """Test ContentType""" + # inst = ContentType() + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py new file mode 100644 index 0000000..9504ab4 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py @@ -0,0 +1,59 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters + + +class TestExtractionParameters(unittest.TestCase): + """ExtractionParameters unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> ExtractionParameters: + """Test ExtractionParameters + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" + # uncomment below to create an instance of `ExtractionParameters` + """ + model = ExtractionParameters() + if include_optional: + return ExtractionParameters( + document_name = '', + confluence_kwargs = [ + {"value":"value","key":"key"} + ], + type = '' + ) + else: + return ExtractionParameters( + document_name = '', + type = '', + ) + """ + + def testExtractionParameters(self): + """Test ExtractionParameters""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py new file mode 100644 index 0000000..1401561 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py @@ -0,0 +1,56 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest + + +class TestExtractionRequest(unittest.TestCase): + """ExtractionRequest unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> ExtractionRequest: + """Test ExtractionRequest + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" + # uncomment below to create an instance of `ExtractionRequest` + """ + model = ExtractionRequest() + if include_optional: + return ExtractionRequest( + path_on_s3 = '', + document_name = '' + ) + else: + return ExtractionRequest( + path_on_s3 = '', + document_name = '', + ) + """ + + def testExtractionRequest(self): + """Test ExtractionRequest""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py new file mode 100644 index 0000000..975a7bf --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py @@ -0,0 +1,39 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi + + +class TestExtractorApi(unittest.TestCase): + """ExtractorApi unit test stubs""" + + def setUp(self) -> None: + self.api = ExtractorApi() + + def tearDown(self) -> None: + pass + + def test_extract_from_file_post(self) -> None: + """Test case for extract_from_file_post""" + pass + + def test_extract_from_source(self) -> None: + """Test case for extract_from_source""" + pass + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py new file mode 100644 index 0000000..479c858 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py @@ -0,0 +1,62 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import InformationPiece + + +class TestInformationPiece(unittest.TestCase): + """InformationPiece unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> InformationPiece: + """Test InformationPiece + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" + # uncomment below to create an instance of `InformationPiece` + """ + model = InformationPiece() + if include_optional: + return InformationPiece( + metadata = [ + {"value":"value","key":"key"} + ], + page_content = '', + type = 'IMAGE' + ) + else: + return InformationPiece( + metadata = [ + {"value":"value","key":"key"} + ], + page_content = '', + type = 'IMAGE', + ) + """ + + def testInformationPiece(self): + """Test InformationPiece""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py new file mode 100644 index 0000000..0ddc864 --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py @@ -0,0 +1,54 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. 
+""" # noqa: E501 + + +import unittest + +from admin_api_lib.extractor_api_client.openapi_client.models.key_value_pair import KeyValuePair + + +class TestKeyValuePair(unittest.TestCase): + """KeyValuePair unit test stubs""" + + def setUp(self): + pass + + def tearDown(self): + pass + + def make_instance(self, include_optional) -> KeyValuePair: + """Test KeyValuePair + include_optional is a boolean, when False only required + params are included, when True both required and + optional params are included""" + # uncomment below to create an instance of `KeyValuePair` + """ + model = KeyValuePair() + if include_optional: + return KeyValuePair( + key = None, + value = None + ) + else: + return KeyValuePair( + ) + """ + + def testKeyValuePair(self): + """Test KeyValuePair""" + # inst_req_only = self.make_instance(include_optional=False) + # inst_req_and_optional = self.make_instance(include_optional=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index 3adbae1..2a0f678 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -99,7 +99,6 @@ async def upload_source( ) -> None: await source_uploader.upload_source(str(request.base_url), type, name, kwargs) - @inject async def upload_file( self, @@ -109,7 +108,6 @@ async def upload_file( ) -> None: await file_uploader.upload_source(str(request.base_url), file) - @inject async def document_reference_id_get( self, diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 0dd5b4f..89d432c 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -56,8 +56,7 @@ async def upload_source( file: UploadFile, ) -> None: 
self._background_threads = [t for t in self._background_threads if t.is_alive()] - - + try: content = await file.read() file.filename = sanitize_document_name(file.filename) @@ -65,10 +64,10 @@ async def upload_source( # TODO: check if document already in processing state self._key_value_store.upsert( source_name, Status.PROCESSING - ) # TODO: change to pipeline with timeout to error status + ) # TODO: change to pipeline with timeout to error status s3_path = await self._asave_new_document(content, file.filename, source_name) thread = Thread( - target=lambda: run(self._handle_source_upload(s3_path,source_name, file.filename, base_url)) + target=lambda: run(self._handle_source_upload(s3_path, source_name, file.filename, base_url)) ) thread.start() self._background_threads.append(thread) @@ -82,10 +81,10 @@ async def upload_source( async def _handle_source_upload( self, - s3_path:Path, + s3_path: Path, source_name: str, - file_name:str, - base_url: str, + file_name: str, + base_url: str, ): try: information_pieces = self._extractor_api.extract(s3_path, source_name) @@ -98,11 +97,11 @@ async def _handle_source_upload( chunked_documents = self._chunker.chunk(documents) enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents) - self._add_file_url(file_name,base_url,enhanced_documents) + self._add_file_url(file_name, base_url, enhanced_documents) rag_information_pieces = [ self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents - ] + ] # Replace old document try: await self._document_deleter.adelete_document(source_name) @@ -116,9 +115,7 @@ async def _handle_source_upload( self._key_value_store.upsert(source_name, Status.ERROR) logger.error("Error while uploading %s = %s", source_name, str(e)) - def _add_file_url( - self, file: UploadFile, base_url: str, chunked_documents: list[Document] - ): + def _add_file_url(self, file: UploadFile, base_url: str, chunked_documents: list[Document]): document_url = 
f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file.name)}" for idx, chunk in enumerate(chunked_documents): if chunk.metadata["id"] in chunk.metadata["related"]: @@ -135,8 +132,8 @@ async def _asave_new_document( self, file_content: bytes, filename: str, - source_name:str, - )->Path: + source_name: str, + ) -> Path: try: with tempfile.TemporaryDirectory() as temp_dir: temp_file_path = Path(temp_dir) / filename diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 5ef6a72..81df19c 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -61,10 +61,8 @@ async def upload_source( # TODO: check if document already in processing state self._key_value_store.upsert( source_name, Status.PROCESSING - ) # TODO: change to pipeline with timeout to error status - thread = Thread( - target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, kwargs)) - ) + ) # TODO: change to pipeline with timeout to error status + thread = Thread(target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, kwargs))) thread.start() self._background_threads.append(thread) except ValueError as e: diff --git a/extractor-api-lib/openapi.yaml b/extractor-api-lib/openapi.yaml index 81ca3e2..ebfad6c 100644 --- a/extractor-api-lib/openapi.yaml +++ b/extractor-api-lib/openapi.yaml @@ -5,12 +5,12 @@ info: servers: - url: / paths: - /extract: + /extract_from_file: post: - operationId: extract + operationId: extract_from_file_post requestBody: content: - multipart/form-data: + application/json: schema: $ref: '#/components/schemas/extraction_request' required: true @@ -24,13 +24,56 @@ paths: type: array description: List of extracted information. "422": - description: Body is not a valid source. 
+ description: Body is not a valid PDF. "500": description: Something somewhere went terribly wrong. tags: - extractor + /extract_from_source: + post: + operationId: extract_from_source + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/extraction_parameters' + required: true + responses: + "200": + content: + application/json: + schema: + items: + $ref: '#/components/schemas/information_piece' + type: array + description: ok + "404": + description: not found + "422": + description: unprocessable entity + "500": + description: internal server error + tags: + - extractor components: schemas: + extraction_request: + description: "" + example: + path_on_s3: path on s3 + properties: + path_on_s3: + description: "" + title: PathOnS3 + type: string + document_name: + description: "" + type: string + required: + - document_name + - path_on_s3 + title: ExtractionRequest + type: object key_value_pair: description: "" example: @@ -81,24 +124,26 @@ components: - type title: InformationPiece type: object - extraction_request: + extraction_parameters: description: "" properties: - file: - description: "" - type: file - type: - description: "" + document_name: + description: The name that will be used to store the confluence db in the + key value db and the vectordatabase (metadata.document). 
+ title: document_name type: string - kwargs: - description: "" + confluence_kwargs: + description: Kwargs for the extractor items: $ref: '#/components/schemas/key_value_pair' + title: confluence_kwargs type: array - name: - description: "" + type: + description: Extractortype + title: type type: string required: - - name + - document_name - type + title: confluence_parameters type: object diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index 38c9a1d..47479f0 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -1,11 +1,11 @@ # coding: utf-8 -from typing import Annotated, Dict, List # noqa: F401 +from typing import Dict, List # noqa: F401 import importlib import pkgutil from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi -import extractor_api_lib.impl +import openapi_server.impl from fastapi import ( # noqa: F401 APIRouter, @@ -23,36 +23,51 @@ ) from extractor_api_lib.models.extra_models import TokenModel # noqa: F401 -from pydantic import StrictBytes, StrictStr -from fastapi import Request, Response, UploadFile -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List +from extractor_api_lib.models.extraction_parameters import ExtractionParameters +from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair router = APIRouter() -ns_pkg = extractor_api_lib.impl +ns_pkg = openapi_server.impl for _, name, _ in pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + "."): importlib.import_module(name) @router.post( - "/extract", + "/extract_from_file", responses={ 200: {"model": List[InformationPiece], "description": "List of extracted information."}, - 422: {"description": "Body is not a valid 
source."}, + 422: {"description": "Body is not a valid PDF."}, 500: {"description": "Something somewhere went terribly wrong."}, }, tags=["extractor"], response_model_by_alias=True, ) -async def extract( - type: Annotated[str, Form()], - name: Annotated[str, Form()], - file: Optional[UploadFile] = None, - kwargs: Optional[Annotated[List[KeyValuePair], Form()]] = None, +async def extract_from_file_post( + extraction_request: ExtractionRequest = Body(None, description=""), ) -> List[InformationPiece]: if not BaseExtractorApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseExtractorApi.subclasses[0]().extract(type, name, file, kwargs) + return await BaseExtractorApi.subclasses[0]().extract_from_file_post(extraction_request) + + +@router.post( + "/extract_from_source", + responses={ + 200: {"model": List[InformationPiece], "description": "ok"}, + 404: {"description": "not found"}, + 422: {"description": "unprocessable entity"}, + 500: {"description": "internal server error"}, + }, + tags=["extractor"], + response_model_by_alias=True, +) +async def extract_from_source( + extraction_parameters: ExtractionParameters = Body(None, description=""), +) -> List[InformationPiece]: + if not BaseExtractorApi.subclasses: + raise HTTPException(status_code=500, detail="Not implemented") + return await BaseExtractorApi.subclasses[0]().extract_from_source(extraction_parameters) diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py index f7a7cf0..b1bac98 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py @@ -2,11 +2,10 @@ from typing import ClassVar, Dict, List, Tuple # noqa: F401 -from pydantic import StrictBytes, StrictStr -from typing import Any, List, Optional, Tuple, Union -from fastapi import Request, Response, UploadFile +from typing import 
Any, List +from extractor_api_lib.models.extraction_parameters import ExtractionParameters +from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair class BaseExtractorApi: @@ -16,10 +15,12 @@ def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) BaseExtractorApi.subclasses = BaseExtractorApi.subclasses + (cls,) - async def extract( + async def extract_from_file_post( self, - type: StrictStr, - name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[List[KeyValuePair]], + extraction_request: ExtractionRequest, + ) -> List[InformationPiece]: ... + + async def extract_from_source( + self, + extraction_parameters: ExtractionParameters, ) -> List[InformationPiece]: ... diff --git a/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py index 553d79a..e9602d4 100644 --- a/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py @@ -45,7 +45,7 @@ async def aextract_content(self, file_path: Path, name: str) -> list[InternalInf Path to the file the information should be extracted from. name : str Name of the document. 
- + Returns ------- list[InformationPiece] diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py new file mode 100644 index 0000000..3aed2ca --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py @@ -0,0 +1,105 @@ +# coding: utf-8 + +""" +extractor-api-lib + +No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) + +The version of the OpenAPI document: 1.0.0 +Generated by OpenAPI Generator (https://openapi-generator.tech) + +Do not edit the class manually. +""" # noqa: E501 + + +from __future__ import annotations +import pprint +import re # noqa: F401 +import json + + +from pydantic import BaseModel, ConfigDict, Field, StrictStr +from typing import Any, ClassVar, Dict, List, Optional +from extractor_api_lib.models.key_value_pair import KeyValuePair + +try: + from typing import Self +except ImportError: + from typing_extensions import Self + + +class ExtractionParameters(BaseModel): + """ """ # noqa: E501 + + document_name: StrictStr = Field( + description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." 
+ ) + confluence_kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") + type: StrictStr = Field(description="Extractortype") + __properties: ClassVar[List[str]] = ["document_name", "confluence_kwargs", "type"] + + model_config = { + "populate_by_name": True, + "validate_assignment": True, + "protected_namespaces": (), + } + + def to_str(self) -> str: + """Returns the string representation of the model using alias""" + return pprint.pformat(self.model_dump(by_alias=True)) + + def to_json(self) -> str: + """Returns the JSON representation of the model using alias""" + # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> Self: + """Create an instance of ExtractionParameters from a JSON string""" + return cls.from_dict(json.loads(json_str)) + + def to_dict(self) -> Dict[str, Any]: + """Return the dictionary representation of the model using alias. + + This has the following differences from calling pydantic's + `self.model_dump(by_alias=True)`: + + * `None` is only added to the output dict for nullable fields that + were set at model initialization. Other fields with value `None` + are ignored. 
+ """ + _dict = self.model_dump( + by_alias=True, + exclude={}, + exclude_none=True, + ) + # override the default output from pydantic by calling `to_dict()` of each item in confluence_kwargs (list) + _items = [] + if self.confluence_kwargs: + for _item in self.confluence_kwargs: + if _item: + _items.append(_item.to_dict()) + _dict["confluence_kwargs"] = _items + return _dict + + @classmethod + def from_dict(cls, obj: Dict) -> Self: + """Create an instance of ExtractionParameters from a dict""" + if obj is None: + return None + + if not isinstance(obj, dict): + return cls.model_validate(obj) + + _obj = cls.model_validate( + { + "document_name": obj.get("document_name"), + "confluence_kwargs": ( + [KeyValuePair.from_dict(_item) for _item in obj.get("confluence_kwargs")] + if obj.get("confluence_kwargs") is not None + else None + ), + "type": obj.get("type"), + } + ) + return _obj diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py index 8917378..769b658 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py @@ -18,9 +18,8 @@ import json -from pydantic import BaseModel, ConfigDict, StrictBytes, StrictStr -from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union -from extractor_api_lib.models.key_value_pair import KeyValuePair +from pydantic import BaseModel, ConfigDict, StrictStr +from typing import Any, ClassVar, Dict, List try: from typing import Self @@ -31,10 +30,9 @@ class ExtractionRequest(BaseModel): """ """ # noqa: E501 - file: Optional[Union[StrictBytes, StrictStr, Tuple[StrictStr, StrictBytes]]] = None - type: StrictStr - kwargs: Optional[List[KeyValuePair]] = None - __properties: ClassVar[List[str]] = ["file", "type", "kwargs"] + path_on_s3: StrictStr + document_name: StrictStr + __properties: ClassVar[List[str]] = ["path_on_s3", 
"document_name"] model_config = { "populate_by_name": True, @@ -71,13 +69,6 @@ def to_dict(self) -> Dict[str, Any]: exclude={}, exclude_none=True, ) - # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) - _items = [] - if self.kwargs: - for _item in self.kwargs: - if _item: - _items.append(_item.to_dict()) - _dict["kwargs"] = _items return _dict @classmethod @@ -89,15 +80,5 @@ def from_dict(cls, obj: Dict) -> Self: if not isinstance(obj, dict): return cls.model_validate(obj) - _obj = cls.model_validate( - { - "file": obj.get("file"), - "type": obj.get("type"), - "kwargs": ( - [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] - if obj.get("kwargs") is not None - else None - ), - } - ) + _obj = cls.model_validate({"path_on_s3": obj.get("path_on_s3"), "document_name": obj.get("document_name")}) return _obj From f10aa41bbad95925adf37393f20fea0c8a63e959 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Thu, 15 May 2025 14:01:33 +0200 Subject: [PATCH 11/56] wip --- .../src/admin_api_lib/dependency_container.py | 24 +++++++++++++++++-- .../models/extraction_parameters.py | 24 +++++++++---------- .../api_endpoints/default_file_uploader.py | 9 ++++--- .../api_endpoints/default_source_uploader.py | 9 ++++--- extractor-api-lib/openapi.yaml | 10 ++++---- .../models/extraction_parameters.py | 20 ++++++++-------- 6 files changed, 61 insertions(+), 35 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/dependency_container.py b/admin-api-lib/src/admin_api_lib/dependency_container.py index 93b3ab2..2ae6a1d 100644 --- a/admin-api-lib/src/admin_api_lib/dependency_container.py +++ b/admin-api-lib/src/admin_api_lib/dependency_container.py @@ -1,5 +1,6 @@ """Module for the DependencyContainer class.""" +from admin_api_lib.impl.api_endpoints.default_file_uploader import DefaultFileUploader from dependency_injector.containers import DeclarativeContainer from dependency_injector.providers import ( # noqa: WOT001 
Configuration, @@ -11,7 +12,13 @@ from langchain_community.llms import Ollama, VLLMOpenAI from langfuse import Langfuse -from admin_api_lib.extractor_api_client.extractor_api_client import ExtractorApiClient +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ( + ExtractorApi, +) +from admin_api_lib.extractor_api_client.openapi_client.api_client import ApiClient +from admin_api_lib.extractor_api_client.openapi_client.configuration import ( + Configuration as ExtractorConfiguration, +) from admin_api_lib.impl.api_endpoints.default_source_uploader import DefaultSourceUploader from admin_api_lib.impl.api_endpoints.default_document_deleter import ( DefaultDocumentDeleter, @@ -87,7 +94,9 @@ class DependencyContainer(DeclarativeContainer): ) chunker = Singleton(TextChunker, text_splitter) - document_extractor = Singleton(ExtractorApiClient, document_extractor_settings.host) + extractor_api_configuration = Singleton(ExtractorConfiguration, host=document_extractor_settings.host) + document_extractor_api_client = Singleton(ApiClient, extractor_api_configuration) + document_extractor = Singleton(ExtractorApi, document_extractor_api_client) rag_api_configuration = Singleton(RagConfiguration, host=rag_api_settings.host) rag_api_client = Singleton(RagApiClient, configuration=rag_api_configuration) @@ -159,3 +168,14 @@ class DependencyContainer(DeclarativeContainer): key_value_store=key_value_store, document_deleter=document_deleter, ) + + file_uploader = Singleton( + DefaultFileUploader, + extractor_api=document_extractor, + rag_api=rag_api, + information_enhancer=information_enhancer, + information_mapper=information_mapper, + chunker=chunker, + key_value_store=key_value_store, + document_deleter=document_deleter, + ) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py index 
da4408d..37db1e8 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py @@ -30,9 +30,9 @@ class ExtractionParameters(BaseModel): document_name: StrictStr = Field( description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." ) - confluence_kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") type: StrictStr = Field(description="Extractortype") - __properties: ClassVar[List[str]] = ["document_name", "confluence_kwargs", "type"] + kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") + __properties: ClassVar[List[str]] = ["document_name", "type", "kwargs"] model_config = ConfigDict( populate_by_name=True, @@ -71,13 +71,13 @@ def to_dict(self) -> Dict[str, Any]: exclude=excluded_fields, exclude_none=True, ) - # override the default output from pydantic by calling `to_dict()` of each item in confluence_kwargs (list) + # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) _items = [] - if self.confluence_kwargs: - for _item_confluence_kwargs in self.confluence_kwargs: - if _item_confluence_kwargs: - _items.append(_item_confluence_kwargs.to_dict()) - _dict["confluence_kwargs"] = _items + if self.kwargs: + for _item_kwargs in self.kwargs: + if _item_kwargs: + _items.append(_item_kwargs.to_dict()) + _dict["kwargs"] = _items return _dict @classmethod @@ -92,12 +92,12 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: _obj = cls.model_validate( { "document_name": obj.get("document_name"), - "confluence_kwargs": ( - [KeyValuePair.from_dict(_item) for _item in obj["confluence_kwargs"]] - if obj.get("confluence_kwargs") is not None + "type": obj.get("type"), + "kwargs": ( + 
[KeyValuePair.from_dict(_item) for _item in obj["kwargs"]] + if obj.get("kwargs") is not None else None ), - "type": obj.get("type"), } ) return _obj diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 89d432c..124d895 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -9,6 +9,8 @@ import tempfile from urllib.request import Request +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi +from extractor_api_lib.models.extraction_request import ExtractionRequest from pydantic import StrictBytes, StrictStr from fastapi import UploadFile, status from langchain_core.documents import Document @@ -21,7 +23,6 @@ from admin_api_lib.api_endpoints.source_uploader import SourceUploader from admin_api_lib.chunker.chunker import Chunker from admin_api_lib.models.status import Status -from admin_api_lib.extractor_api_client.extractor_api_client import ExtractorApiClient from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer from admin_api_lib.utils.utils import sanitize_document_name @@ -33,7 +34,7 @@ class DefaultFileUploader(FileUploader): def __init__( self, - extractor_api: ExtractorApiClient, + extractor_api: ExtractorApi, key_value_store: FileStatusKeyValueStore, information_enhancer: InformationEnhancer, chunker: Chunker, @@ -87,7 +88,9 @@ async def _handle_source_upload( base_url: str, ): try: - information_pieces = self._extractor_api.extract(s3_path, source_name) + information_pieces = self._extractor_api.extract_from_file_post( + ExtractionRequest(path_on_s3=s3_path, document_name=source_name) + ) if not information_pieces: self._key_value_store.upsert(source_name, 
Status.ERROR) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 81df19c..1637595 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -7,6 +7,8 @@ import urllib import tempfile +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters from pydantic import StrictBytes, StrictStr from fastapi import UploadFile, status from langchain_core.documents import Document @@ -19,7 +21,6 @@ from admin_api_lib.api_endpoints.source_uploader import SourceUploader from admin_api_lib.chunker.chunker import Chunker from admin_api_lib.models.status import Status -from admin_api_lib.extractor_api_client.extractor_api_client import ExtractorApiClient from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer from admin_api_lib.utils.utils import sanitize_document_name @@ -31,7 +32,7 @@ class DefaultSourceUploader(SourceUploader): def __init__( self, - extractor_api: ExtractorApiClient, + extractor_api: ExtractorApi, key_value_store: FileStatusKeyValueStore, information_enhancer: InformationEnhancer, chunker: Chunker, @@ -81,7 +82,9 @@ async def _handle_source_upload( kwargs: list[KeyValuePair], ): try: - information_pieces = self._extractor_api.extract(type, source_name, kwargs) + information_pieces = self._extractor_api.extract_from_source( + ExtractionParameters(type=type, document_name=source_name, kwargs=kwargs) + ) if not information_pieces: self._key_value_store.upsert(source_name, Status.ERROR) diff --git a/extractor-api-lib/openapi.yaml 
b/extractor-api-lib/openapi.yaml index ebfad6c..d178a86 100644 --- a/extractor-api-lib/openapi.yaml +++ b/extractor-api-lib/openapi.yaml @@ -132,16 +132,16 @@ components: key value db and the vectordatabase (metadata.document). title: document_name type: string - confluence_kwargs: + type: + description: Extractortype + title: type + type: string + kwargs: description: Kwargs for the extractor items: $ref: '#/components/schemas/key_value_pair' title: confluence_kwargs type: array - type: - description: Extractortype - title: type - type: string required: - document_name - type diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py index 3aed2ca..d701978 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py @@ -34,9 +34,9 @@ class ExtractionParameters(BaseModel): document_name: StrictStr = Field( description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." 
) - confluence_kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") type: StrictStr = Field(description="Extractortype") - __properties: ClassVar[List[str]] = ["document_name", "confluence_kwargs", "type"] + kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") + __properties: ClassVar[List[str]] = ["document_name", "type", "kwargs"] model_config = { "populate_by_name": True, @@ -73,13 +73,13 @@ def to_dict(self) -> Dict[str, Any]: exclude={}, exclude_none=True, ) - # override the default output from pydantic by calling `to_dict()` of each item in confluence_kwargs (list) + # override the default output from pydantic by calling `to_dict()` of each item in kwargs (list) _items = [] - if self.confluence_kwargs: - for _item in self.confluence_kwargs: + if self.kwargs: + for _item in self.kwargs: if _item: _items.append(_item.to_dict()) - _dict["confluence_kwargs"] = _items + _dict["kwargs"] = _items return _dict @classmethod @@ -94,12 +94,12 @@ def from_dict(cls, obj: Dict) -> Self: _obj = cls.model_validate( { "document_name": obj.get("document_name"), - "confluence_kwargs": ( - [KeyValuePair.from_dict(_item) for _item in obj.get("confluence_kwargs")] - if obj.get("confluence_kwargs") is not None + "type": obj.get("type"), + "kwargs": ( + [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] + if obj.get("kwargs") is not None else None ), - "type": obj.get("type"), } ) return _obj From 96e53e7d5a2bab796d626956a0ecac38e5ab25e9 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 16 May 2025 15:14:41 +0200 Subject: [PATCH 12/56] fix --- .../src/admin_api_lib/dependency_container.py | 1 + .../src/admin_api_lib/impl/admin_api.py | 12 ++-- .../api_endpoints/default_file_uploader.py | 23 ++++--- .../api_endpoints/default_source_uploader.py | 8 +-- .../impl/mapper/informationpiece2document.py | 4 +- .../api_endpoints/file_extractor.py | 23 +++++++ .../{extractor.py => 
source_extractor.py} | 14 ++--- .../extractor_api_lib/apis/extractor_api.py | 4 +- .../extractor_api_lib/dependency_container.py | 12 ++-- .../extractors/information_extractor.py | 12 ++-- .../general_file_extractor.py | 38 ++++-------- ...tractor.py => general_source_extractor.py} | 14 ++--- .../impl/extractor_api_impl.py | 38 +++++------- .../impl/extractors/confluence_extractor.py | 10 ++- ...ce_langchain_document2information_piece.py | 62 ++++--------------- 15 files changed, 120 insertions(+), 155 deletions(-) create mode 100644 extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py rename extractor-api-lib/src/extractor_api_lib/api_endpoints/{extractor.py => source_extractor.py} (60%) rename extractor-api-lib/src/extractor_api_lib/impl/{extractors => api_endpoints}/general_file_extractor.py (71%) rename extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/{default_extractor.py => general_source_extractor.py} (87%) diff --git a/admin-api-lib/src/admin_api_lib/dependency_container.py b/admin-api-lib/src/admin_api_lib/dependency_container.py index 2ae6a1d..640ea72 100644 --- a/admin-api-lib/src/admin_api_lib/dependency_container.py +++ b/admin-api-lib/src/admin_api_lib/dependency_container.py @@ -178,4 +178,5 @@ class DependencyContainer(DeclarativeContainer): chunker=chunker, key_value_store=key_value_store, document_deleter=document_deleter, + file_service=file_service, ) diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index 2a0f678..b05d7d7 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -2,15 +2,15 @@ import logging from typing import List, Optional -from pydantic import Field, StrictBytes, StrictStr -from admin_api_lib.api_endpoints.source_uploader import SourceUploader -from admin_api_lib.models.key_value_pair import KeyValuePair -from admin_api_lib.models.upload_source import UploadSource + 
+from pydantic import Field, StrictBytes, StrictStr from dependency_injector.wiring import Provide, inject from fastapi import Depends, Request, Response, UploadFile - +from admin_api_lib.api_endpoints.file_uploader import FileUploader +from admin_api_lib.api_endpoints.source_uploader import SourceUploader +from admin_api_lib.models.key_value_pair import KeyValuePair from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter from admin_api_lib.api_endpoints.document_reference_retriever import ( DocumentReferenceRetriever, @@ -106,7 +106,7 @@ async def upload_file( request: Request, file_uploader: FileUploader = Depends(Provide[DependencyContainer.file_uploader]), ) -> None: - await file_uploader.upload_source(str(request.base_url), file) + await file_uploader.upload_file(str(request.base_url), file) @inject async def document_reference_id_get( diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 124d895..703e3b8 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -9,18 +9,21 @@ import tempfile from urllib.request import Request -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi -from extractor_api_lib.models.extraction_request import ExtractionRequest + + + +from admin_api_lib.file_services.file_service import FileService from pydantic import StrictBytes, StrictStr from fastapi import UploadFile, status from langchain_core.documents import Document from asyncio import run -from admin_api_lib.models.key_value_pair import KeyValuePair +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest +from admin_api_lib.api_endpoints.file_uploader import FileUploader +from 
admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi from admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document from admin_api_lib.api_endpoints.document_deleter import DocumentDeleter -from admin_api_lib.api_endpoints.source_uploader import SourceUploader from admin_api_lib.chunker.chunker import Chunker from admin_api_lib.models.status import Status from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore @@ -41,6 +44,7 @@ def __init__( document_deleter: DocumentDeleter, rag_api: RagApi, information_mapper: InformationPiece2Document, + file_service: FileService, ): self._extractor_api = extractor_api self._rag_api = rag_api @@ -50,8 +54,9 @@ def __init__( self._chunker = chunker self._document_deleter = document_deleter self._background_threads = [] + self._file_service = file_service - async def upload_source( + async def upload_file( self, base_url: str, file: UploadFile, @@ -89,7 +94,7 @@ async def _handle_source_upload( ): try: information_pieces = self._extractor_api.extract_from_file_post( - ExtractionRequest(path_on_s3=s3_path, document_name=source_name) + ExtractionRequest(path_on_s3=str(s3_path), document_name=source_name) ) if not information_pieces: @@ -118,8 +123,8 @@ async def _handle_source_upload( self._key_value_store.upsert(source_name, Status.ERROR) logger.error("Error while uploading %s = %s", source_name, str(e)) - def _add_file_url(self, file: UploadFile, base_url: str, chunked_documents: list[Document]): - document_url = f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file.name)}" + def _add_file_url(self, file_name: str, base_url: str, chunked_documents: list[Document]): + document_url = f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file_name)}" for idx, chunk in enumerate(chunked_documents): if chunk.metadata["id"] in 
chunk.metadata["related"]: chunk.metadata["related"].remove(chunk.metadata["id"]) @@ -146,7 +151,7 @@ async def _asave_new_document( logger.debug("Temp file created and content written.") self._file_service.upload_file(Path(temp_file_path), filename) - return Path(temp_file_path) + return filename except Exception as e: logger.error("Error during document saving: %s %s", e, traceback.format_exc()) self._key_value_store.upsert(source_name, Status.ERROR) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 1637595..deb8cac 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -6,14 +6,13 @@ from threading import Thread import urllib import tempfile - -from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi -from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters from pydantic import StrictBytes, StrictStr from fastapi import UploadFile, status from langchain_core.documents import Document from asyncio import run +from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi +from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters from admin_api_lib.models.key_value_pair import KeyValuePair from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi from admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document @@ -79,11 +78,12 @@ async def _handle_source_upload( source_name: str, base_url: str, type: StrictStr, + name:str, kwargs: list[KeyValuePair], ): try: information_pieces = self._extractor_api.extract_from_source( - ExtractionParameters(type=type, document_name=source_name, kwargs=kwargs) + 
ExtractionParameters(type=type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs]) ) if not information_pieces: diff --git a/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py b/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py index 6f0ac2f..a3a40ce 100644 --- a/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py +++ b/admin-api-lib/src/admin_api_lib/impl/mapper/informationpiece2document.py @@ -4,10 +4,10 @@ from langchain_core.documents import Document as LangchainDocument -from admin_api_lib.extractor_api_client.models.content_type import ( +from admin_api_lib.extractor_api_client.openapi_client.models.content_type import ( ContentType as ExtractorInformaType, ) -from admin_api_lib.extractor_api_client.models.information_piece import ( +from admin_api_lib.extractor_api_client.openapi_client.models.information_piece import ( InformationPiece as ExtractorInformationPiece, ) from admin_api_lib.rag_backend_client.openapi_client.models.information_piece import ( diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py new file mode 100644 index 0000000..499a09d --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py @@ -0,0 +1,23 @@ +from abc import ABC, abstractmethod +from extractor_api_lib.models.extraction_request import ExtractionRequest +from extractor_api_lib.models.information_piece import InformationPiece + + +class FileExtractor(ABC): + """Abstract base class for extract_information endpoint.""" + + @abstractmethod + async def aextract_information(self, extraction_request: ExtractionRequest) -> list[InformationPiece]: + """ + Extract information of a document, given by the extraction_request. 
+ + Parameters + ---------- + extraction_request : ExtractionRequest + The request containing the details of the document to be processed for information extraction. + + Returns + ------- + list[InformationPiece] + A list of extracted information pieces from the document. + """ \ No newline at end of file diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py similarity index 60% rename from extractor-api-lib/src/extractor_api_lib/api_endpoints/extractor.py rename to extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py index c3f254b..44b5c38 100644 --- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from typing import Optional +from extractor_api_lib.models.extraction_parameters import ExtractionParameters from pydantic import StrictStr from fastapi import UploadFile @@ -8,23 +9,20 @@ from extractor_api_lib.models.key_value_pair import KeyValuePair -class Extractor(ABC): +class SourceExtractor(ABC): @abstractmethod async def aextract_information( self, - type: StrictStr, - name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[list[KeyValuePair]], + extraction_parameters: ExtractionParameters, ) -> list[InformationPiece]: """ - Extract information from confluence, using the given confluence parameters. + Extract information from source, using the given parameters. Parameters ---------- - confluence_parameters : ConfluenceParameters - The parameters used to extract information from Confluence. + extraction_parameters : ExtractionParameters + The parameters used to extract information from the source. 
Returns ------- diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index 47479f0..fc3d0ee 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -5,7 +5,7 @@ import pkgutil from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi -import openapi_server.impl +import extractor_api_lib.impl from fastapi import ( # noqa: F401 APIRouter, @@ -31,7 +31,7 @@ router = APIRouter() -ns_pkg = openapi_server.impl +ns_pkg = extractor_api_lib.impl for _, name, _ in pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + "."): importlib.import_module(name) diff --git a/extractor-api-lib/src/extractor_api_lib/dependency_container.py b/extractor-api-lib/src/extractor_api_lib/dependency_container.py index 2c5c53f..a4adfe0 100644 --- a/extractor-api-lib/src/extractor_api_lib/dependency_container.py +++ b/extractor-api-lib/src/extractor_api_lib/dependency_container.py @@ -3,12 +3,12 @@ from dependency_injector.containers import DeclarativeContainer from dependency_injector.providers import List, Singleton # noqa: WOT001 -from extractor_api_lib.impl.api_endpoints.default_extractor import DefaultExtractor +from extractor_api_lib.impl.api_endpoints.general_source_extractor import GeneralSourceExtractor from extractor_api_lib.impl.extractors.confluence_extractor import ConfluenceExtractor from extractor_api_lib.impl.extractors.file_extractors.ms_docs_extractor import MSDocsExtractor from extractor_api_lib.impl.extractors.file_extractors.pdf_extractor import PDFExtractor from extractor_api_lib.impl.extractors.file_extractors.xml_extractor import XMLExtractor -from extractor_api_lib.impl.extractors.general_file_extractor import GeneralFileExtractor +from extractor_api_lib.impl.api_endpoints.general_file_extractor import GeneralFileExtractor from extractor_api_lib.impl.file_services.s3_service import 
S3Service from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import ( ConfluenceLangchainDocument2InformationPiece, @@ -38,11 +38,11 @@ class DependencyContainer(DeclarativeContainer): langchain_document2information_piece = Singleton(ConfluenceLangchainDocument2InformationPiece) file_extractors = List(pdf_extractor, ms_docs_extractor, xml_extractor) - general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors) + general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors,intern2external) confluence_extractor = Singleton(ConfluenceExtractor, mapper=langchain_document2information_piece) - default_extractor = Singleton( - DefaultExtractor, + source_extractor = Singleton( + GeneralSourceExtractor, mapper=intern2external, - available_extractors=List(general_file_extractor, confluence_extractor), + available_extractors=List(confluence_extractor), ) diff --git a/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py index eeaadf1..92c71c3 100644 --- a/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py @@ -4,6 +4,7 @@ from typing import Optional +from extractor_api_lib.models.extraction_parameters import ExtractionParameters from fastapi import UploadFile from pydantic import StrictStr @@ -23,18 +24,15 @@ def extractor_type(self) -> ExtractorTypes: ... @abstractmethod async def aextract_content( self, - type: StrictStr, - name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[list[KeyValuePair]], + extraction_parameters: ExtractionParameters, ) -> list[InternalInformationPiece]: """ - Extract content from given file. + Extract content from source. Parameters ---------- - file_path : Path - Path to the file the information should be extracted from. 
+ extraction_parameters : ExtractionParameters + The parameters used to extract information from the source. Returns ------- diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py similarity index 71% rename from extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py rename to extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py index 04abb2c..505431f 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/general_file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py @@ -7,9 +7,9 @@ from typing import Any, List, Optional -from pydantic import StrictStr -from fastapi import UploadFile - +from extractor_api_lib.api_endpoints.file_extractor import FileExtractor +from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece +from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.file_services.file_service import FileService from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor from extractor_api_lib.extractors.information_extractor import InformationExtractor @@ -21,7 +21,7 @@ logger = logging.getLogger(__name__) -class GeneralFileExtractor(InformationExtractor): +class GeneralFileExtractor(FileExtractor): """A class to extract information from documents using available extractors. This class serves as a general extractor that utilizes a list of available @@ -29,7 +29,7 @@ class GeneralFileExtractor(InformationExtractor): appropriate extractor based on the file type of the document. 
""" - def __init__(self, file_service: FileService, available_extractors: list[InformationFileExtractor]): + def __init__(self, file_service: FileService, available_extractors: list[InformationFileExtractor], mapper: Internal2ExternalInformationPiece): """ Initialize the GeneralExtractor. @@ -42,18 +42,9 @@ def __init__(self, file_service: FileService, available_extractors: list[Informa """ self._file_service = file_service self._available_extractors = available_extractors + self._mapper = mapper - @property - def extractor_type(self) -> ExtractorTypes: - return ExtractorTypes.FILE - - async def aextract_content( - self, - type: StrictStr, - name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[List[KeyValuePair]], - ) -> list[InternalInformationPiece]: + async def aextract_information(self, extraction_request: ExtractionRequest) -> list[InformationPiece]: """ Extract content from given file. @@ -66,25 +57,22 @@ async def aextract_content( ------- list[InformationPiece] The extracted information. 
- """ - # save file on s3 - content = await file.read() - filename = file.filename + """ try: - with tempfile.TemporaryDirectory() as temp_dir: - temp_file_path = Path(temp_dir) / filename + with tempfile.TemporaryDirectory() as temp_dir: + temp_file_path = Path(temp_dir) / Path(extraction_request.path_on_s3).name with open(temp_file_path, "wb") as temp_file: + self._file_service.download_file(extraction_request.path_on_s3,temp_file) logger.debug("Temporary file created at %s.", temp_file_path) - temp_file.write(content) logger.debug("Temp file created and content written.") - self._file_service.upload_file(temp_file_path, filename) file_type = str(temp_file_path).split(".")[-1].upper() correct_extractors = [ x for x in self._available_extractors if file_type in [y.value for y in x.compatible_file_types] ] if not correct_extractors: raise ValueError(f"No extractor found for file-ending {file_type}") - return await correct_extractors[-1].aextract_content(temp_file_path, name) + results = await correct_extractors[-1].aextract_content(temp_file_path, extraction_request.document_name) + return [self._mapper.map_internal_to_external(x) for x in results if x.page_content is not None] except Exception as e: logger.error("Error during document parsing: %s %s", e, traceback.format_exc()) raise e diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py similarity index 87% rename from extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_extractor.py rename to extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py index b485c1e..7e135b6 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py @@ -3,6 +3,7 @@ import logging from typing import Optional +from 
extractor_api_lib.models.extraction_parameters import ExtractionParameters from pydantic import StrictStr from fastapi import UploadFile @@ -10,7 +11,7 @@ from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.models.key_value_pair import KeyValuePair from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece -from extractor_api_lib.api_endpoints.extractor import Extractor +from extractor_api_lib.api_endpoints.source_extractor import SourceExtractor from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.models.key_value_pair import KeyValuePair @@ -21,7 +22,7 @@ logger = logging.getLogger(__name__) -class DefaultExtractor(Extractor): +class GeneralSourceExtractor(SourceExtractor): """A class to extract information from documents using available extractors. This class serves as a general extractor that utilizes a list of available @@ -43,10 +44,7 @@ def __init__(self, available_extractors: list[InformationExtractor], mapper: Int async def aextract_information( self, - type: StrictStr, - name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[list[KeyValuePair]], + extraction_parameters: ExtractionParameters, ) -> list[InformationPiece]: """ Extract content from given file. @@ -61,8 +59,8 @@ async def aextract_information( list[InformationPiece] The extracted information. 
""" - correct_extractors = [x for x in self._available_extractors if type == x.extractor_type] + correct_extractors = [x for x in self._available_extractors if extraction_parameters.type == x.extractor_type] if not correct_extractors: raise ValueError(f"No extractor found for type {type}") - results = await correct_extractors[-1].aextract_content(type, name, file, kwargs) + results = await correct_extractors[-1].aextract_content(extraction_parameters) return [self._mapper.map_internal_to_external(x) for x in results if x.page_content is not None] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py index bfe9393..df8a59f 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py @@ -1,7 +1,10 @@ """Module for the implementation of the ExtractorApi interface.""" from dependency_injector.wiring import Provide, inject -from extractor_api_lib.api_endpoints.extractor import Extractor +from extractor_api_lib.api_endpoints.file_extractor import FileExtractor +from extractor_api_lib.api_endpoints.source_extractor import SourceExtractor +from extractor_api_lib.models.extraction_parameters import ExtractionParameters +from extractor_api_lib.models.extraction_request import ExtractionRequest from fastapi import Depends, UploadFile from pydantic import StrictStr @@ -18,27 +21,16 @@ class ExtractorApiImpl(BaseExtractorApi): """Implementation of the ExtractorApi interface.""" @inject - async def extract( + async def extract_from_file_post( self, - type: StrictStr, - name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[list[KeyValuePair]], - extractor: Extractor = Depends(Provide[DependencyContainer.default_extractor]), - ) -> list[InformationPiece]: - """ - Extract information from a source. 
+ extraction_request: ExtractionRequest, + extractor: FileExtractor = Depends(Provide[DependencyContainer.general_file_extractor]), + ) -> list[InformationPiece]: + return await extractor.aextract_information(extraction_request) - Parameters - ---------- - extraction_request : ExtractionRequest - The request containing details about the extraction process. - file_extractor : FileExtractor, optional - The file extractor dependency, by default Depends(Provide[DependencyContainer.file_extractor]). - - Returns - ------- - list[InformationPiece] - A list of extracted information pieces. - """ - return await extractor.aextract_information(type, name, file, kwargs) + async def extract_from_source( + self, + extraction_parameters: ExtractionParameters, + extractor: SourceExtractor = Depends(Provide[DependencyContainer.source_extractor]), + ) -> list[InformationPiece]: + return await extractor.aextract_information(extraction_parameters) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py index 1f7c666..faf9c4e 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py @@ -3,6 +3,7 @@ from typing import Optional from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece +from extractor_api_lib.models.extraction_parameters import ExtractionParameters from pydantic import StrictStr from langchain_community.document_loaders import ConfluenceLoader from fastapi import UploadFile @@ -40,10 +41,7 @@ def extractor_type(self) -> ExtractorTypes: async def aextract_content( self, - type: StrictStr, - name: StrictStr, - file: Optional[UploadFile], - kwargs: Optional[list[KeyValuePair]], + extraction_parameters: ExtractionParameters, ) -> list[InternalInformationPiece]: """ Asynchronously extracts 
information pieces from Confluence. @@ -59,10 +57,10 @@ async def aextract_content( A list of information pieces extracted from Confluence. """ # Convert list of key value pairs to dict - confluence_loader_parameters = {x.key: x.value for x in kwargs} + confluence_loader_parameters = {x.key: int(x.value) if x.value.isdigit() else x.value for x in extraction_parameters.kwargs} # Drop the document_name parameter as it is not used by the ConfluenceLoader if "document_name" in confluence_loader_parameters: confluence_loader_parameters.pop("document_name", None) document_loader = ConfluenceLoader(**confluence_loader_parameters) documents = document_loader.load() - return [self.mapper.map_document2informationpiece(x) for x in documents] + return [self.mapper.map_document2informationpiece(x, extraction_parameters.document_name) for x in documents] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py index 96e6efe..77e5435 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py @@ -1,5 +1,6 @@ """Module for the ConfluenceLangchainDocument2InformationPiece class.""" +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from langchain_core.documents import Document as LangchainDocument from extractor_api_lib.models.confluence_parameters import ConfluenceParameters @@ -35,35 +36,7 @@ class ConfluenceLangchainDocument2InformationPiece: USE_CASE_RELATED_KEY = "related" DOCUMENT_KEY = "document" - def __init__(self) -> None: - """Initialize the ConfluenceLangchainDocument2InformationPiece instance.""" - self._confluence_parameters = None - - @property - def confluence_parameters(self): - """ - 
Property that returns the Confluence parameters. - - Returns - ------- - dict - A dictionary containing the Confluence parameters. - """ - return self._confluence_parameters - - @confluence_parameters.setter - def confluence_parameters(self, confluence_parameters: ConfluenceParameters): - """ - Set the confluence parameters. - - Parameters - ---------- - confluence_parameters : ConfluenceParameters - The confluence parameters to be set. - """ - self._confluence_parameters = confluence_parameters - - def map_document2informationpiece(self, document: LangchainDocument) -> InformationPiece: + def map_document2informationpiece(self, document: LangchainDocument, document_name:str) -> InternalInformationPiece: """ Map a LangchainDocument to an InformationPiece. @@ -81,28 +54,19 @@ def map_document2informationpiece(self, document: LangchainDocument) -> Informat ------ ValueError If Confluence parameters are not set before mapping documents. - """ - if self._confluence_parameters is None: - raise ValueError("Confluence parameters must be set before mapping documents") + """ + meta = self._map_meta(document.metadata, document_name) + return InternalInformationPiece(page_content=document.page_content, type=ContentType.TEXT, metadata=meta) - meta = self._map_meta(document.metadata) - return InformationPiece(page_content=document.page_content, type=ContentType.TEXT, metadata=meta) - - def _map_meta(self, internal: dict) -> list[MetaInformationPiece]: - metadata = [] + def _map_meta(self, internal: dict, document_name:str) -> dict: + metadata = {} for key, value in internal.items(): - metadata.append( - MetaInformationPiece( - key=self.USE_CASE_DOCUMENT_URL_KEY if key == self.CONFLUENCE_LOADER_SOURCE_URL_KEY else key, - value=value, - ) - ) - page_title_matches = [m.value for m in metadata if m.key == self.CONFLUENCE_LOADER_TITLE_KEY] + metadata[self.USE_CASE_DOCUMENT_URL_KEY if key == self.CONFLUENCE_LOADER_SOURCE_URL_KEY else key]=value + + page_title_matches = [v for k,v in 
metadata.items() if k == self.CONFLUENCE_LOADER_TITLE_KEY] page_title = page_title_matches[0] if page_title_matches else "Unknown Title" - metadata.append(MetaInformationPiece(key=self.USER_CASE_PAGE_KEY, value=page_title)) - metadata.append( - MetaInformationPiece(key=self.DOCUMENT_KEY, value=self._confluence_parameters.document_name) - ) - metadata.append(MetaInformationPiece(key=self.USE_CASE_RELATED_KEY, value=[])) + metadata[self.USER_CASE_PAGE_KEY]=page_title + metadata[self.DOCUMENT_KEY]=document_name + metadata[self.USE_CASE_RELATED_KEY]=[] return metadata From a1f8feeb513025c8290a935d064c13442fbcdf80 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 16 May 2025 15:15:34 +0200 Subject: [PATCH 13/56] black --- .../api_endpoints/default_file_uploader.py | 2 -- .../api_endpoints/default_source_uploader.py | 2 +- .../api_endpoints/file_extractor.py | 2 +- .../extractor_api_lib/dependency_container.py | 4 ++-- .../extractors/information_extractor.py | 2 +- .../api_endpoints/general_file_extractor.py | 17 ++++++++++++----- .../impl/extractor_api_impl.py | 4 ++-- .../impl/extractors/confluence_extractor.py | 6 ++++-- ...nce_langchain_document2information_piece.py | 18 ++++++++++-------- 9 files changed, 33 insertions(+), 24 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 703e3b8..37a8e28 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -10,8 +10,6 @@ from urllib.request import Request - - from admin_api_lib.file_services.file_service import FileService from pydantic import StrictBytes, StrictStr from fastapi import UploadFile, status diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py 
index deb8cac..ab1e153 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -78,7 +78,7 @@ async def _handle_source_upload( source_name: str, base_url: str, type: StrictStr, - name:str, + name: str, kwargs: list[KeyValuePair], ): try: diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py index 499a09d..ad968a2 100644 --- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py @@ -20,4 +20,4 @@ async def aextract_information(self, extraction_request: ExtractionRequest) -> l ------- list[InformationPiece] A list of extracted information pieces from the document. - """ \ No newline at end of file + """ diff --git a/extractor-api-lib/src/extractor_api_lib/dependency_container.py b/extractor-api-lib/src/extractor_api_lib/dependency_container.py index a4adfe0..ad671d9 100644 --- a/extractor-api-lib/src/extractor_api_lib/dependency_container.py +++ b/extractor-api-lib/src/extractor_api_lib/dependency_container.py @@ -38,11 +38,11 @@ class DependencyContainer(DeclarativeContainer): langchain_document2information_piece = Singleton(ConfluenceLangchainDocument2InformationPiece) file_extractors = List(pdf_extractor, ms_docs_extractor, xml_extractor) - general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors,intern2external) + general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors, intern2external) confluence_extractor = Singleton(ConfluenceExtractor, mapper=langchain_document2information_piece) source_extractor = Singleton( GeneralSourceExtractor, mapper=intern2external, - available_extractors=List(confluence_extractor), + available_extractors=List(confluence_extractor), ) diff --git 
a/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py index 92c71c3..35952cf 100644 --- a/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py @@ -24,7 +24,7 @@ def extractor_type(self) -> ExtractorTypes: ... @abstractmethod async def aextract_content( self, - extraction_parameters: ExtractionParameters, + extraction_parameters: ExtractionParameters, ) -> list[InternalInformationPiece]: """ Extract content from source. diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py index 505431f..8ed9e8b 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py @@ -29,7 +29,12 @@ class GeneralFileExtractor(FileExtractor): appropriate extractor based on the file type of the document. """ - def __init__(self, file_service: FileService, available_extractors: list[InformationFileExtractor], mapper: Internal2ExternalInformationPiece): + def __init__( + self, + file_service: FileService, + available_extractors: list[InformationFileExtractor], + mapper: Internal2ExternalInformationPiece, + ): """ Initialize the GeneralExtractor. @@ -57,12 +62,12 @@ async def aextract_information(self, extraction_request: ExtractionRequest) -> l ------- list[InformationPiece] The extracted information. 
- """ + """ try: - with tempfile.TemporaryDirectory() as temp_dir: + with tempfile.TemporaryDirectory() as temp_dir: temp_file_path = Path(temp_dir) / Path(extraction_request.path_on_s3).name with open(temp_file_path, "wb") as temp_file: - self._file_service.download_file(extraction_request.path_on_s3,temp_file) + self._file_service.download_file(extraction_request.path_on_s3, temp_file) logger.debug("Temporary file created at %s.", temp_file_path) logger.debug("Temp file created and content written.") file_type = str(temp_file_path).split(".")[-1].upper() @@ -71,7 +76,9 @@ async def aextract_information(self, extraction_request: ExtractionRequest) -> l ] if not correct_extractors: raise ValueError(f"No extractor found for file-ending {file_type}") - results = await correct_extractors[-1].aextract_content(temp_file_path, extraction_request.document_name) + results = await correct_extractors[-1].aextract_content( + temp_file_path, extraction_request.document_name + ) return [self._mapper.map_internal_to_external(x) for x in results if x.page_content is not None] except Exception as e: logger.error("Error during document parsing: %s %s", e, traceback.format_exc()) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py index df8a59f..50a8623 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py @@ -25,12 +25,12 @@ async def extract_from_file_post( self, extraction_request: ExtractionRequest, extractor: FileExtractor = Depends(Provide[DependencyContainer.general_file_extractor]), - ) -> list[InformationPiece]: + ) -> list[InformationPiece]: return await extractor.aextract_information(extraction_request) async def extract_from_source( self, extraction_parameters: ExtractionParameters, extractor: SourceExtractor = Depends(Provide[DependencyContainer.source_extractor]), - ) -> 
list[InformationPiece]: + ) -> list[InformationPiece]: return await extractor.aextract_information(extraction_parameters) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py index faf9c4e..8b1c07e 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py @@ -41,7 +41,7 @@ def extractor_type(self) -> ExtractorTypes: async def aextract_content( self, - extraction_parameters: ExtractionParameters, + extraction_parameters: ExtractionParameters, ) -> list[InternalInformationPiece]: """ Asynchronously extracts information pieces from Confluence. @@ -57,7 +57,9 @@ async def aextract_content( A list of information pieces extracted from Confluence. """ # Convert list of key value pairs to dict - confluence_loader_parameters = {x.key: int(x.value) if x.value.isdigit() else x.value for x in extraction_parameters.kwargs} + confluence_loader_parameters = { + x.key: int(x.value) if x.value.isdigit() else x.value for x in extraction_parameters.kwargs + } # Drop the document_name parameter as it is not used by the ConfluenceLoader if "document_name" in confluence_loader_parameters: confluence_loader_parameters.pop("document_name", None) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py index 77e5435..85b92bd 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py @@ -36,7 +36,9 @@ class ConfluenceLangchainDocument2InformationPiece: USE_CASE_RELATED_KEY = "related" DOCUMENT_KEY = "document" - def 
map_document2informationpiece(self, document: LangchainDocument, document_name:str) -> InternalInformationPiece: + def map_document2informationpiece( + self, document: LangchainDocument, document_name: str + ) -> InternalInformationPiece: """ Map a LangchainDocument to an InformationPiece. @@ -54,19 +56,19 @@ def map_document2informationpiece(self, document: LangchainDocument, document_na ------ ValueError If Confluence parameters are not set before mapping documents. - """ + """ meta = self._map_meta(document.metadata, document_name) return InternalInformationPiece(page_content=document.page_content, type=ContentType.TEXT, metadata=meta) - def _map_meta(self, internal: dict, document_name:str) -> dict: + def _map_meta(self, internal: dict, document_name: str) -> dict: metadata = {} for key, value in internal.items(): - metadata[self.USE_CASE_DOCUMENT_URL_KEY if key == self.CONFLUENCE_LOADER_SOURCE_URL_KEY else key]=value + metadata[self.USE_CASE_DOCUMENT_URL_KEY if key == self.CONFLUENCE_LOADER_SOURCE_URL_KEY else key] = value - page_title_matches = [v for k,v in metadata.items() if k == self.CONFLUENCE_LOADER_TITLE_KEY] + page_title_matches = [v for k, v in metadata.items() if k == self.CONFLUENCE_LOADER_TITLE_KEY] page_title = page_title_matches[0] if page_title_matches else "Unknown Title" - metadata[self.USER_CASE_PAGE_KEY]=page_title - metadata[self.DOCUMENT_KEY]=document_name - metadata[self.USE_CASE_RELATED_KEY]=[] + metadata[self.USER_CASE_PAGE_KEY] = page_title + metadata[self.DOCUMENT_KEY] = document_name + metadata[self.USE_CASE_RELATED_KEY] = [] return metadata From 54f3c32e7a50291956bd9db81aab059a82f91e72 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 16 May 2025 15:46:15 +0200 Subject: [PATCH 14/56] linting --- admin-api-lib/pyproject.toml | 1 - .../api_endpoints/source_uploader.py | 4 +- .../src/admin_api_lib/apis/admin_api.py | 15 +++--- .../src/admin_api_lib/apis/admin_api_base.py | 9 ++-- .../src/admin_api_lib/impl/admin_api.py | 5 +- 
.../api_endpoints/default_file_uploader.py | 15 ++---- .../api_endpoints/default_source_uploader.py | 52 ++++++------------- 7 files changed, 35 insertions(+), 66 deletions(-) diff --git a/admin-api-lib/pyproject.toml b/admin-api-lib/pyproject.toml index d7a995f..ec0de57 100644 --- a/admin-api-lib/pyproject.toml +++ b/admin-api-lib/pyproject.toml @@ -107,7 +107,6 @@ langfuse = "^2.60.4" redis = "^6.0.0" pyyaml = "^6.0.2" python-multipart = "^0.0.20" -requests-toolbelt = "^1.0.0" [tool.pytest.ini_options] log_cli = 1 diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py index 9cdd59e..3f9c15a 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -1,8 +1,6 @@ from abc import ABC, abstractmethod -from typing import Optional from pydantic import StrictStr -from fastapi import UploadFile from admin_api_lib.models.key_value_pair import KeyValuePair @@ -13,7 +11,7 @@ class SourceUploader(ABC): async def upload_source( self, base_url: str, - type: StrictStr, + source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], ) -> None: ... 
diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 4fe1e15..7f3eb1a 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -3,9 +3,7 @@ from typing import Dict, List # noqa: F401 import importlib import pkgutil - -from admin_api_lib.apis.admin_api_base import BaseAdminApi -from fastapi import APIRouter, Path, Request, Response, UploadFile, Form # noqa: F401 +from typing_extensions import Annotated import admin_api_lib.impl @@ -15,6 +13,8 @@ Cookie, Depends, Form, + UploadFile, + Request, Header, HTTPException, Path, @@ -23,15 +23,14 @@ Security, status, ) +from pydantic import Field, StrictStr -from admin_api_lib.models.extra_models import TokenModel # noqa: F401 -from pydantic import Field, StrictBytes, StrictStr -from typing import Any, List, Tuple, Union -from typing_extensions import Annotated + +from admin_api_lib.apis.admin_api_base import BaseAdminApi from admin_api_lib.models.document_status import DocumentStatus from admin_api_lib.models.http_validation_error import HTTPValidationError from admin_api_lib.models.key_value_pair import KeyValuePair - +from admin_api_lib.models.extra_models import TokenModel # noqa: F401 router = APIRouter() diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index eb5ca84..ee1d0a4 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -1,14 +1,13 @@ # coding: utf-8 from typing import ClassVar, Dict, List, Tuple # noqa: F401 - -from pydantic import Field, StrictBytes, StrictStr -from typing import Any, List, Tuple, Union from typing_extensions import Annotated + +from pydantic import Field, StrictStr +from fastapi import Request, Response, UploadFile + from admin_api_lib.models.document_status import DocumentStatus -from 
admin_api_lib.models.http_validation_error import HTTPValidationError from admin_api_lib.models.key_value_pair import KeyValuePair -from fastapi import Request, Response, UploadFile class BaseAdminApi: diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index b05d7d7..d2e880a 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -1,10 +1,9 @@ """Module containing the implementation of the Admin API.""" import logging -from typing import List, Optional -from pydantic import Field, StrictBytes, StrictStr +from pydantic import StrictStr from dependency_injector.wiring import Provide, inject from fastapi import Depends, Request, Response, UploadFile @@ -93,7 +92,7 @@ async def upload_source( self, type: StrictStr, name: StrictStr, - kwargs: List[KeyValuePair], + kwargs: list[KeyValuePair], request: Request, source_uploader: SourceUploader = Depends(Provide[DependencyContainer.source_uploader]), ) -> None: diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 37a8e28..62b6448 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -1,21 +1,17 @@ from http.client import HTTPException import logging -import os from pathlib import Path import traceback -from typing import Optional, Tuple, Union from threading import Thread import urllib import tempfile -from urllib.request import Request +from contextlib import suppress - -from admin_api_lib.file_services.file_service import FileService -from pydantic import StrictBytes, StrictStr from fastapi import UploadFile, status from langchain_core.documents import Document from asyncio import run +from admin_api_lib.file_services.file_service import FileService from 
admin_api_lib.extractor_api_client.openapi_client.models.extraction_request import ExtractionRequest from admin_api_lib.api_endpoints.file_uploader import FileUploader from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi @@ -109,11 +105,10 @@ async def _handle_source_upload( self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents ] # Replace old document - try: + # deletion is allowed to fail + with suppress(Exception): await self._document_deleter.adelete_document(source_name) - except Exception as e: - # deletion is allowed to fail - pass + self._rag_api.upload_information_piece(rag_information_pieces) self._key_value_store.upsert(source_name, Status.READY) logger.info("Source uploaded successfully: %s", source_name) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index ab1e153..f843fa4 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -1,15 +1,12 @@ from http.client import HTTPException import logging -import os -from pathlib import Path -from typing import Optional, Tuple, Union -from threading import Thread -import urllib -import tempfile -from pydantic import StrictBytes, StrictStr -from fastapi import UploadFile, status -from langchain_core.documents import Document from asyncio import run +from threading import Thread +from contextlib import suppress + +from pydantic import StrictStr +from fastapi import status + from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters @@ -51,18 +48,20 @@ def __init__( async def upload_source( self, base_url: str, - type: StrictStr, + source_type: StrictStr, 
name: StrictStr, kwargs: list[KeyValuePair], ) -> None: self._background_threads = [t for t in self._background_threads if t.is_alive()] - source_name = f"{type}:{sanitize_document_name(name)}" + source_name = f"{source_type}:{sanitize_document_name(name)}" try: # TODO: check if document already in processing state self._key_value_store.upsert( source_name, Status.PROCESSING ) # TODO: change to pipeline with timeout to error status - thread = Thread(target=lambda: run(self._handle_source_upload(source_name, base_url, type, name, kwargs))) + thread = Thread( + target=lambda: run(self._handle_source_upload(source_name, base_url, source_type, name, kwargs)) + ) thread.start() self._background_threads.append(thread) except ValueError as e: @@ -77,13 +76,13 @@ async def _handle_source_upload( self, source_name: str, base_url: str, - type: StrictStr, + source_type: StrictStr, name: str, kwargs: list[KeyValuePair], ): try: information_pieces = self._extractor_api.extract_from_source( - ExtractionParameters(type=type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs]) + ExtractionParameters(type=source_type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs]) ) if not information_pieces: @@ -99,32 +98,13 @@ async def _handle_source_upload( ] # Replace old document - try: + # deletion is allowed to fail + with suppress(Exception): await self._document_deleter.adelete_document(source_name) - except Exception as e: - # deletion is allowed to fail - pass + self._rag_api.upload_information_piece(rag_information_pieces) self._key_value_store.upsert(source_name, Status.READY) logger.info("Source uploaded successfully: %s", source_name) except Exception as e: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("Error while uploading %s = %s", source_name, str(e)) - - def _add_file_url( - self, type: StrictStr, file: Optional[UploadFile], base_url: str, chunked_documents: list[Document] - ): - if type != "file": - return - - document_url 
= f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file.name)}" - for idx, chunk in enumerate(chunked_documents): - if chunk.metadata["id"] in chunk.metadata["related"]: - chunk.metadata["related"].remove(chunk.metadata["id"]) - chunk.metadata.update( - { - "chunk": idx, - "chunk_length": len(chunk.page_content), - "document_url": document_url, - } - ) From 0aa4d92d75bf5e7d88bb1202c1756ac77226ec18 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 16 May 2025 15:59:48 +0200 Subject: [PATCH 15/56] wip --- admin-api-lib/openapi.yaml | 720 +++++++++--------- .../src/admin_api_lib/apis/admin_api.py | 2 +- .../src/admin_api_lib/apis/admin_api_base.py | 2 +- .../src/admin_api_lib/impl/admin_api.py | 2 +- extractor-api-lib/openapi.yaml | 292 +++---- .../api_endpoints/source_extractor.py | 5 - .../extractor_api_lib/apis/extractor_api.py | 1 - .../apis/extractor_api_base.py | 1 - .../extractors/information_extractor.py | 6 - .../extractors/information_file_extractor.py | 1 - .../api_endpoints/general_file_extractor.py | 5 - .../api_endpoints/general_source_extractor.py | 10 - .../impl/extractor_api_impl.py | 9 +- .../impl/extractors/confluence_extractor.py | 10 +- .../file_extractors/ms_docs_extractor.py | 2 - .../file_extractors/pdf_extractor.py | 1 - .../file_extractors/xml_extractor.py | 1 - ...ce_langchain_document2information_piece.py | 5 +- .../internal2external_information_piece.py | 6 +- 19 files changed, 525 insertions(+), 556 deletions(-) diff --git a/admin-api-lib/openapi.yaml b/admin-api-lib/openapi.yaml index 86d433a..986f445 100644 --- a/admin-api-lib/openapi.yaml +++ b/admin-api-lib/openapi.yaml @@ -1,364 +1,378 @@ openapi: 3.1.0 info: - description: The API is used for the communication between the admin frontend - and the admin backend in the rag project. 
- title: admin-api-lib - version: 1.0.0 + title: admin-api-lib + version: 1.0.0 + description: >- + The API is used for the communication between the admin frontend and the admin backend in the + rag project. servers: -- url: /api + - + url: /api paths: - /delete_document/{identification}: - delete: - description: |- - Asynchronously deletes a document based on the provided identification. + '/delete_document/{identification}': + delete: + tags: + - admin + parameters: + - + style: simple + explode: false + name: identification + schema: + title: Identification + description: '' + type: string + in: path + required: true + responses: + '200': + content: + application/json: + schema: {} + description: Deleted + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + '500': + description: Internal server error + operationId: delete_document + summary: Delete Document + description: |- + Asynchronously deletes a document based on the provided identification. - Parameters - ---------- - identification : str - The unique identifier of the document to be deleted. + Parameters + ---------- + identification : str + The unique identifier of the document to be deleted. - Returns - ------- - None - operationId: delete_document - parameters: - - explode: false - in: path - name: identification - required: true - schema: - description: "" - title: Identification - type: string - style: simple - responses: - "200": - content: - application/json: - schema: {} - description: Deleted - "422": - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - description: Validation Error - "500": - description: Internal server error - summary: Delete Document - tags: - - admin - /document_reference/{identification}: - get: - description: |- - Asynchronously retrieve a document reference by its identification. 
+ Returns + ------- + None + '/document_reference/{identification}': + get: + tags: + - admin + parameters: + - + style: simple + explode: false + name: identification + description: Identifier of the document. + schema: + title: Identification + description: Identifier of the document. + type: string + in: path + required: true + responses: + '200': + content: + application/json: + schema: + format: binary + title: Response 200 Document Reference Document Reference Identification Get + type: string + description: Returns the pdf in binary form. + '400': + content: + application/json: + schema: + title: Response 400 Document Reference Document Reference Identification Get + type: string + description: Bad request + '404': + content: + application/json: + schema: + title: Response 404 Document Reference Document Reference Identification Get + type: string + description: Document not found. + '422': + content: + application/json: + schema: + $ref: '#/components/schemas/HTTPValidationError' + description: Validation Error + '500': + content: + application/json: + schema: + title: Response 500 Document Reference Document Reference Identification Get + type: string + description: Internal server error + operationId: document_reference + summary: Document Reference Id Get + description: |- + Asynchronously retrieve a document reference by its identification. - Parameters - ---------- - identification : str - The unique identifier for the document reference. + Parameters + ---------- + identification : str + The unique identifier for the document reference. - Returns - ------- - Response - The response object containing the document reference details. - operationId: document_reference - parameters: - - description: Identifier of the document. - explode: false - in: path - name: identification - required: true - schema: - description: Identifier of the document. 
- title: Identification - type: string - style: simple - responses: - "200": - content: - application/json: - schema: - format: binary - title: Response 200 Document Reference Document Reference Identification Get - type: string - description: Returns the pdf in binary form. - "400": - content: - application/json: - schema: - title: Response 400 Document Reference Document Reference Identification Get - type: string - description: Bad request - "404": - content: - application/json: - schema: - title: Response 404 Document Reference Document Reference Identification Get - type: string - description: Document not found. - "422": - content: - application/json: - schema: - $ref: '#/components/schemas/HTTPValidationError' - description: Validation Error - "500": - content: - application/json: - schema: - title: Response 500 Document Reference Document Reference Identification Get - type: string - description: Internal server error - summary: Document Reference Id Get - tags: - - admin - /all_documents_status: - get: - description: |- - Asynchronously retrieves the status of all documents. + Returns + ------- + Response + The response object containing the document reference details. + /all_documents_status: + get: + tags: + - admin + responses: + '200': + content: + application/json: + schema: + type: array + items: + $ref: '#/components/schemas/DocumentStatus' + description: List of document links + '500': + description: Internal server error + operationId: get_all_documents_status + summary: Get All Documents Status + description: |- + Asynchronously retrieves the status of all documents. - Returns - ------- - list[DocumentStatus] - A list containing the status of all documents. 
- operationId: get_all_documents_status - responses: - "200": - content: - application/json: - schema: - items: - $ref: '#/components/schemas/DocumentStatus' - type: array - description: List of document links - "500": - description: Internal server error - summary: Get All Documents Status - tags: - - admin - /upload_file: - post: - description: Uploads user selected sources. - operationId: upload_file - requestBody: - content: - multipart/form-data: - schema: - $ref: '#/components/schemas/Body_upload_file_upload_file_post' - required: true - responses: - "200": - content: - application/json: - schema: {} - description: ok - "400": - description: Bad request - "422": - description: Unprocessable Content - "500": - description: Internal server error - summary: Upload File - tags: - - admin - /upload_source: - post: - description: Uploads user selected sources. - operationId: upload_source - parameters: - - explode: true - in: query - name: type - required: false - schema: - description: "" - title: Type - type: string - style: form - - explode: true - in: query - name: name - required: false - schema: - description: "" - title: Name - type: string - style: form - requestBody: - content: - application/json: - schema: - description: "" - items: - $ref: '#/components/schemas/KeyValuePair' - type: array - responses: - "200": - content: - application/json: - schema: {} - description: ok - "400": - description: Bad request - "422": - description: Unprocessable Content - "500": - description: Internal server error - summary: Upload Source - tags: - - admin + Returns + ------- + list[DocumentStatus] + A list containing the status of all documents. 
+ /upload_file: + post: + requestBody: + content: + multipart/form-data: + schema: + $ref: '#/components/schemas/Body_upload_file_upload_file_post' + required: true + tags: + - admin + responses: + '200': + content: + application/json: + schema: {} + description: ok + '400': + description: Bad request + '422': + description: Unprocessable Content + '500': + description: Internal server error + operationId: upload_file + summary: Upload File + description: Uploads user selected sources. + /upload_source: + post: + requestBody: + content: + application/json: + schema: + description: '' + type: array + items: + $ref: '#/components/schemas/KeyValuePair' + tags: + - admin + parameters: + - + style: form + explode: true + name: source_type + schema: + title: Type + description: '' + type: string + in: query + required: false + - + style: form + explode: true + name: name + schema: + title: Name + description: '' + type: string + in: query + required: false + responses: + '200': + content: + application/json: + schema: {} + description: ok + '400': + description: Bad request + '422': + description: Unprocessable Content + '500': + description: Internal server error + operationId: upload_source + summary: Upload Source + description: Uploads user selected sources. 
components: - schemas: - Body_upload_file_upload_file_post: - properties: - file: - format: binary - title: File - type: string - required: - - file - title: Body_upload_file_upload_file_post - DocumentStatus: - description: DocumentStatus - example: - name: name - status: UPLOADING - properties: - name: - title: Name - type: string - status: - $ref: '#/components/schemas/Status' - required: - - name - - status - title: DocumentStatus - HTTPValidationError: - description: HTTPValidationError - example: - detail: - - msg: msg - loc: - - anyof_schema_1_validator: anyof_schema_1_validator - actual_instance: "" - any_of_schemas: - - any_of_schemas - - any_of_schemas - anyof_schema_2_validator: 0 - - anyof_schema_1_validator: anyof_schema_1_validator - actual_instance: "" - any_of_schemas: - - any_of_schemas - - any_of_schemas - anyof_schema_2_validator: 0 - type: type - - msg: msg - loc: - - anyof_schema_1_validator: anyof_schema_1_validator - actual_instance: "" - any_of_schemas: - - any_of_schemas - - any_of_schemas - anyof_schema_2_validator: 0 - - anyof_schema_1_validator: anyof_schema_1_validator - actual_instance: "" - any_of_schemas: - - any_of_schemas - - any_of_schemas - anyof_schema_2_validator: 0 - type: type - properties: - detail: - items: - $ref: '#/components/schemas/ValidationError' - nullable: true - title: detail - type: array - title: HTTPValidationError - KeyValuePair: - description: KeyValuePair - example: - value: value - key: key - properties: - key: - title: Key - type: string - value: - title: Value - type: string - required: - - key - - value - title: KeyValuePair - Status: - description: allowed enum values - enum: - - UPLOADING - - PROCESSING - - READY - - ERROR - title: Status - type: string - ValidationError: - description: ValidationError - example: - msg: msg - loc: - - anyof_schema_1_validator: anyof_schema_1_validator - actual_instance: "" - any_of_schemas: - - any_of_schemas - - any_of_schemas - anyof_schema_2_validator: 0 - - 
anyof_schema_1_validator: anyof_schema_1_validator - actual_instance: "" - any_of_schemas: - - any_of_schemas - - any_of_schemas - anyof_schema_2_validator: 0 - type: type - properties: - loc: - items: - $ref: '#/components/schemas/ValidationErrorLocInner' - title: loc - type: array - msg: - title: Msg - type: string - type: - title: Type - type: string - required: - - loc - - msg - - type - title: ValidationError - ValidationErrorLocInner: - description: ValidationErrorLocInner - example: - anyof_schema_1_validator: anyof_schema_1_validator - actual_instance: "" - any_of_schemas: - - any_of_schemas - - any_of_schemas - anyof_schema_2_validator: 0 - properties: - anyof_schema_1_validator: - nullable: true - title: anyof_schema_1_validator - type: string - anyof_schema_2_validator: - nullable: true - title: anyof_schema_2_validator - type: integer - actual_instance: - title: actual_instance - any_of_schemas: - items: + schemas: + Body_upload_file_upload_file_post: + title: Body_upload_file_upload_file_post + required: + - file + properties: + file: + format: binary + title: File + type: string + DocumentStatus: + title: DocumentStatus + description: DocumentStatus + required: + - name + - status + properties: + name: + title: Name + type: string + status: + $ref: '#/components/schemas/Status' + example: + name: name + status: UPLOADING + HTTPValidationError: + title: HTTPValidationError + description: HTTPValidationError + properties: + detail: + nullable: true + title: detail + type: array + items: + $ref: '#/components/schemas/ValidationError' + example: + detail: + - + msg: msg + loc: + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + type: type + - + msg: msg + loc: + - + 
anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + type: type + KeyValuePair: + title: KeyValuePair + description: KeyValuePair + required: + - key + - value + properties: + key: + title: Key + type: string + value: + title: Value + type: string + example: + value: value + key: key + Status: + title: Status + description: allowed enum values + enum: + - UPLOADING + - PROCESSING + - READY + - ERROR type: string - title: any_of_schemas - type: array - title: ValidationErrorLocInner + ValidationError: + title: ValidationError + description: ValidationError + required: + - loc + - msg + - type + properties: + loc: + title: loc + type: array + items: + $ref: '#/components/schemas/ValidationErrorLocInner' + msg: + title: Msg + type: string + type: + title: Type + type: string + example: + msg: msg + loc: + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + - + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + anyof_schema_2_validator: 0 + type: type + ValidationErrorLocInner: + title: ValidationErrorLocInner + description: ValidationErrorLocInner + properties: + anyof_schema_1_validator: + nullable: true + title: anyof_schema_1_validator + type: string + anyof_schema_2_validator: + nullable: true + title: anyof_schema_2_validator + type: integer + actual_instance: + title: actual_instance + any_of_schemas: + title: any_of_schemas + type: array + items: + type: string + example: + anyof_schema_1_validator: anyof_schema_1_validator + actual_instance: '' + any_of_schemas: + - any_of_schemas + - any_of_schemas + 
anyof_schema_2_validator: 0 diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 7f3eb1a..ec95b92 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -166,7 +166,7 @@ async def upload_file( ) async def upload_source( request: Request, - type: StrictStr = Query(None, description="", alias="type"), + source_type: StrictStr = Query(None, description="", alias="type"), name: StrictStr = Query(None, description="", alias="name"), key_value_pair: List[KeyValuePair] = Body(None, description=""), ) -> None: diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index ee1d0a4..e184692 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -66,7 +66,7 @@ async def get_all_documents_status( async def upload_source( self, - type: StrictStr, + source_type: StrictStr, name: StrictStr, key_value_pair: List[KeyValuePair], request: Request, diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index d2e880a..04cd6df 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -90,7 +90,7 @@ async def get_all_documents_status( @inject async def upload_source( self, - type: StrictStr, + source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], request: Request, diff --git a/extractor-api-lib/openapi.yaml b/extractor-api-lib/openapi.yaml index d178a86..205d208 100644 --- a/extractor-api-lib/openapi.yaml +++ b/extractor-api-lib/openapi.yaml @@ -1,149 +1,153 @@ openapi: 3.0.2 info: - title: extractor-api-lib - version: 1.0.0 + title: extractor-api-lib + version: 1.0.0 servers: -- url: / + - + url: / paths: - /extract_from_file: - post: - operationId: extract_from_file_post - 
requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/extraction_request' - required: true - responses: - "200": - content: - application/json: - schema: - items: - $ref: '#/components/schemas/information_piece' - type: array - description: List of extracted information. - "422": - description: Body is not a valid PDF. - "500": - description: Something somewhere went terribly wrong. - tags: - - extractor - /extract_from_source: - post: - operationId: extract_from_source - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/extraction_parameters' - required: true - responses: - "200": - content: - application/json: - schema: - items: - $ref: '#/components/schemas/information_piece' - type: array - description: ok - "404": - description: not found - "422": - description: unprocessable entity - "500": - description: internal server error - tags: - - extractor + /extract_from_file: + post: + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/extraction_request' + required: true + tags: + - extractor + responses: + '200': + content: + application/json: + schema: + type: array + items: + $ref: '#/components/schemas/information_piece' + description: List of extracted information. + '422': + description: Body is not a valid PDF. + '500': + description: Something somewhere went terribly wrong. 
+ operationId: extract_from_file_post + /extract_from_source: + post: + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/extraction_parameters' + required: true + tags: + - extractor + responses: + '200': + content: + application/json: + schema: + type: array + items: + $ref: '#/components/schemas/information_piece' + description: ok + '404': + description: not found + '422': + description: unprocessable entity + '500': + description: internal server error + operationId: extract_from_source components: - schemas: - extraction_request: - description: "" - example: - path_on_s3: path on s3 - properties: - path_on_s3: - description: "" - title: PathOnS3 - type: string - document_name: - description: "" - type: string - required: - - document_name - - path_on_s3 - title: ExtractionRequest - type: object - key_value_pair: - description: "" - example: - value: value - key: key - properties: - key: - description: "" - title: Key - value: - description: "" - title: Value - title: MetaInformationPiece - type: object - content_type: - description: "" - enum: - - IMAGE - - TABLE - - TEXT - title: InformationType - type: string - information_piece: - description: A piece of information that has been extracted. - example: - metadata: - - key: key - value: value - - key: key - value: value - page_content: some text - type: TEXT - properties: - metadata: - description: "" - items: - $ref: '#/components/schemas/key_value_pair' - title: MetaInformation - type: array - page_content: - description: "" - type: string - type: - $ref: '#/components/schemas/content_type' - required: - - metadata - - page_content - - type - title: InformationPiece - type: object - extraction_parameters: - description: "" - properties: - document_name: - description: The name that will be used to store the confluence db in the - key value db and the vectordatabase (metadata.document). 
- title: document_name - type: string - type: - description: Extractortype - title: type - type: string - kwargs: - description: Kwargs for the extractor - items: - $ref: '#/components/schemas/key_value_pair' - title: confluence_kwargs - type: array - required: - - document_name - - type - title: confluence_parameters - type: object + schemas: + extraction_request: + title: ExtractionRequest + description: '' + required: + - document_name + - path_on_s3 + type: object + properties: + path_on_s3: + title: PathOnS3 + description: '' + type: string + document_name: + description: '' + type: string + example: + path_on_s3: path on s3 + key_value_pair: + title: MetaInformationPiece + description: '' + type: object + properties: + key: + title: Key + description: '' + value: + title: Value + description: '' + example: + value: value + key: key + content_type: + title: InformationType + description: '' + enum: + - IMAGE + - TABLE + - TEXT + type: string + information_piece: + title: InformationPiece + description: A piece of information that has been extracted. + required: + - metadata + - page_content + - type + type: object + properties: + metadata: + title: MetaInformation + description: '' + type: array + items: + $ref: '#/components/schemas/key_value_pair' + page_content: + description: '' + type: string + type: + $ref: '#/components/schemas/content_type' + example: + metadata: + - + key: key + value: value + - + key: key + value: value + page_content: some text + type: TEXT + extraction_parameters: + title: confluence_parameters + description: '' + required: + - document_name + - source_type + type: object + properties: + document_name: + title: document_name + description: >- + The name that will be used to store the confluence db in the key value db and the + vectordatabase (metadata.document). 
+ type: string + kwargs: + title: confluence_kwargs + description: Kwargs for the extractor + type: array + items: + $ref: '#/components/schemas/key_value_pair' + source_type: + title: type + description: Extractortype + type: string diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py index 44b5c38..d656367 100644 --- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py @@ -1,12 +1,7 @@ from abc import ABC, abstractmethod -from typing import Optional from extractor_api_lib.models.extraction_parameters import ExtractionParameters -from pydantic import StrictStr -from fastapi import UploadFile - from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair class SourceExtractor(ABC): diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index fc3d0ee..7d09897 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -23,7 +23,6 @@ ) from extractor_api_lib.models.extra_models import TokenModel # noqa: F401 -from typing import Any, List from extractor_api_lib.models.extraction_parameters import ExtractionParameters from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py index b1bac98..696c60c 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py @@ -2,7 +2,6 @@ from typing import 
ClassVar, Dict, List, Tuple # noqa: F401 -from typing import Any, List from extractor_api_lib.models.extraction_parameters import ExtractionParameters from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.models.information_piece import InformationPiece diff --git a/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py index 35952cf..3a6ee68 100644 --- a/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py @@ -1,16 +1,10 @@ """Module for the Base class for Information extractors.""" from abc import ABC, abstractmethod -from typing import Optional from extractor_api_lib.models.extraction_parameters import ExtractionParameters -from fastapi import UploadFile -from pydantic import StrictStr - from extractor_api_lib.impl.types.extractor_types import ExtractorTypes -from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece diff --git a/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py index e9602d4..7897c19 100644 --- a/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/extractors/information_file_extractor.py @@ -3,7 +3,6 @@ from abc import ABC, abstractmethod from pathlib import Path -from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.impl.types.file_type import FileType from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from 
extractor_api_lib.file_services.file_service import FileService diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py index 8ed9e8b..fee7db2 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py @@ -4,7 +4,6 @@ from pathlib import Path import tempfile import traceback -from typing import Any, List, Optional from extractor_api_lib.api_endpoints.file_extractor import FileExtractor @@ -12,11 +11,7 @@ from extractor_api_lib.models.extraction_request import ExtractionRequest from extractor_api_lib.file_services.file_service import FileService from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor -from extractor_api_lib.extractors.information_extractor import InformationExtractor -from extractor_api_lib.impl.types.extractor_types import ExtractorTypes from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair -from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece logger = logging.getLogger(__name__) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py index 7e135b6..0c5dbe4 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py @@ -1,22 +1,12 @@ """Module for the DefaultFileExtractor class.""" import logging -from typing import Optional from extractor_api_lib.models.extraction_parameters import ExtractionParameters -from pydantic import StrictStr -from fastapi import UploadFile - 
from extractor_api_lib.extractors.information_extractor import InformationExtractor from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece from extractor_api_lib.api_endpoints.source_extractor import SourceExtractor -from extractor_api_lib.impl.mapper.internal2external_information_piece import Internal2ExternalInformationPiece -from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair -from extractor_api_lib.impl.types.extractor_types import ExtractorTypes -from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece logger = logging.getLogger(__name__) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py index 50a8623..276f720 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py @@ -1,20 +1,15 @@ """Module for the implementation of the ExtractorApi interface.""" +from fastapi import Depends from dependency_injector.wiring import Provide, inject + from extractor_api_lib.api_endpoints.file_extractor import FileExtractor from extractor_api_lib.api_endpoints.source_extractor import SourceExtractor from extractor_api_lib.models.extraction_parameters import ExtractionParameters from extractor_api_lib.models.extraction_request import ExtractionRequest -from fastapi import Depends, UploadFile - -from pydantic import StrictStr -from typing import Optional from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair - from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi from 
extractor_api_lib.dependency_container import DependencyContainer -from extractor_api_lib.models.information_piece import InformationPiece class ExtractorApiImpl(BaseExtractorApi): diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py index 8b1c07e..3cb55f4 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py @@ -1,16 +1,10 @@ """Module for the DefaultConfluenceExtractor class.""" -from typing import Optional - -from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece -from extractor_api_lib.models.extraction_parameters import ExtractionParameters -from pydantic import StrictStr from langchain_community.document_loaders import ConfluenceLoader -from fastapi import UploadFile from extractor_api_lib.impl.types.extractor_types import ExtractorTypes -from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece +from extractor_api_lib.models.extraction_parameters import ExtractionParameters from extractor_api_lib.extractors.information_extractor import InformationExtractor from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import ( ConfluenceLangchainDocument2InformationPiece, diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py index c67425d..5201c62 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py +++ 
b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py @@ -6,14 +6,12 @@ from typing import Any, Optional import pandas as pd - from unstructured.documents.elements import Element from unstructured.partition.docx import partition_docx from unstructured.partition.pptx import partition_pptx from extractor_api_lib.file_services.file_service import FileService -from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor from extractor_api_lib.impl.types.content_type import ContentType from extractor_api_lib.impl.types.file_type import FileType diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py index 8d5bd35..928998f 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py @@ -21,7 +21,6 @@ from extractor_api_lib.impl.utils.utils import hash_datetime from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from extractor_api_lib.table_converter.dataframe_converter import DataframeConverter -from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.file_services.file_service import FileService from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py index e7523b6..d72292a 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py +++ 
b/extractor-api-lib/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py @@ -11,7 +11,6 @@ from extractor_api_lib.file_services.file_service import FileService from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor -from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.impl.types.content_type import ContentType from extractor_api_lib.impl.types.file_type import FileType from extractor_api_lib.impl.utils.utils import hash_datetime diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py index 85b92bd..a7bcb0d 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py @@ -1,12 +1,9 @@ """Module for the ConfluenceLangchainDocument2InformationPiece class.""" -from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from langchain_core.documents import Document as LangchainDocument -from extractor_api_lib.models.confluence_parameters import ConfluenceParameters +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from extractor_api_lib.models.content_type import ContentType -from extractor_api_lib.models.information_piece import InformationPiece -from extractor_api_lib.models.key_value_pair import KeyValuePair as MetaInformationPiece class ConfluenceLangchainDocument2InformationPiece: diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py index 11f57b4..ee611cb 100644 --- 
a/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py @@ -5,9 +5,7 @@ from extractor_api_lib.models.dataclasses.internal_information_piece import ( InternalInformationPiece as InternalInformationPiece, ) -from extractor_api_lib.models.information_piece import ( - InformationPiece as ExternalInformationPiece, -) +from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.models.key_value_pair import KeyValuePair as MetaInformationPiece @@ -27,7 +25,7 @@ class Internal2ExternalInformationPiece: InternalContentType.TABLE: ExternalContentType.TABLE, } - def map_internal_to_external(self, internal: InternalInformationPiece) -> ExternalInformationPiece: + def map_internal_to_external(self, internal: InternalInformationPiece) -> InformationPiece: """Map an InternalInformationPiece object to an ExternalInformationPiece object. 
Parameters From 9f99eebfce0525387f41b2899edbf8411e2364e4 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Fri, 16 May 2025 16:01:09 +0200 Subject: [PATCH 16/56] name change --- .../openapi_client/models/extraction_parameters.py | 6 +++--- .../src/extractor_api_lib/models/extraction_parameters.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py index 37db1e8..13ba2ea 100644 --- a/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py +++ b/admin-api-lib/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py @@ -30,9 +30,9 @@ class ExtractionParameters(BaseModel): document_name: StrictStr = Field( description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." 
) - type: StrictStr = Field(description="Extractortype") kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") - __properties: ClassVar[List[str]] = ["document_name", "type", "kwargs"] + source_type: StrictStr = Field(description="Extractortype") + __properties: ClassVar[List[str]] = ["document_name", "kwargs", "source_type"] model_config = ConfigDict( populate_by_name=True, @@ -92,12 +92,12 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: _obj = cls.model_validate( { "document_name": obj.get("document_name"), - "type": obj.get("type"), "kwargs": ( [KeyValuePair.from_dict(_item) for _item in obj["kwargs"]] if obj.get("kwargs") is not None else None ), + "source_type": obj.get("source_type"), } ) return _obj diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py index d701978..e18a452 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py @@ -34,9 +34,9 @@ class ExtractionParameters(BaseModel): document_name: StrictStr = Field( description="The name that will be used to store the confluence db in the key value db and the vectordatabase (metadata.document)." 
) - type: StrictStr = Field(description="Extractortype") kwargs: Optional[List[KeyValuePair]] = Field(default=None, description="Kwargs for the extractor") - __properties: ClassVar[List[str]] = ["document_name", "type", "kwargs"] + source_type: StrictStr = Field(description="Extractortype") + __properties: ClassVar[List[str]] = ["document_name", "kwargs", "source_type"] model_config = { "populate_by_name": True, @@ -94,12 +94,12 @@ def from_dict(cls, obj: Dict) -> Self: _obj = cls.model_validate( { "document_name": obj.get("document_name"), - "type": obj.get("type"), "kwargs": ( [KeyValuePair.from_dict(_item) for _item in obj.get("kwargs")] if obj.get("kwargs") is not None else None ), + "source_type": obj.get("source_type"), } ) return _obj From 82d27d11d75287d2dc1230e92564011d4f308ee2 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Mon, 19 May 2025 07:51:58 +0200 Subject: [PATCH 17/56] lint --- .../impl/mapper/internal2external_information_piece.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py index ee611cb..6c4d6b8 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py @@ -2,9 +2,7 @@ from extractor_api_lib.impl.types.content_type import ContentType as InternalContentType from extractor_api_lib.models.content_type import ContentType as ExternalContentType -from extractor_api_lib.models.dataclasses.internal_information_piece import ( - InternalInformationPiece as InternalInformationPiece, -) +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from extractor_api_lib.models.information_piece import InformationPiece from extractor_api_lib.models.key_value_pair 
import KeyValuePair as MetaInformationPiece @@ -40,7 +38,7 @@ def map_internal_to_external(self, internal: InternalInformationPiece) -> Inform """ information_type = self._map_information_type(internal.type) meta = self._map_meta(internal.metadata) - return ExternalInformationPiece(page_content=internal.page_content, type=information_type, metadata=meta) + return InformationPiece(page_content=internal.page_content, type=information_type, metadata=meta) def _map_information_type(self, internal: InternalContentType) -> ExternalContentType: return self.TYPE_LOOKUP_TABLE[internal] From c752478055a1dbb19f3d72d5011c4019033b1922 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Mon, 19 May 2025 08:02:46 +0200 Subject: [PATCH 18/56] reset poetry.lock --- admin-api-lib/poetry.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/admin-api-lib/poetry.lock b/admin-api-lib/poetry.lock index bd12f09..223c2a5 100644 --- a/admin-api-lib/poetry.lock +++ b/admin-api-lib/poetry.lock @@ -3693,4 +3693,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.13" -content-hash = "f34effb5fa2b12b05da69ca28c62764dc2017a2a2a9336b5265428005004e7ec" +content-hash = "99eff6a6ab91512602e8e3094b71bdba096ccf58746d47afd92dff99b24da487" \ No newline at end of file From ee8f3c723f291f984d359c706873424fe6e72a01 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Tue, 20 May 2025 08:05:41 +0200 Subject: [PATCH 19/56] fix tests --- admin-api-lib/tests/settings/__init__.py | 0 .../settings/confluence_settings_test.py | 108 ------------------ rag-core-api/src/rag_core_api/apis/rag_api.py | 30 +---- .../src/rag_core_api/apis/rag_api_base.py | 8 +- .../src/rag_core_api/models/chat_history.py | 10 +- .../models/chat_history_message.py | 10 +- .../src/rag_core_api/models/chat_request.py | 12 +- .../src/rag_core_api/models/chat_response.py | 12 +- .../src/rag_core_api/models/chat_role.py | 2 +- .../src/rag_core_api/models/content_type.py | 2 +- 
.../src/rag_core_api/models/delete_request.py | 10 +- .../rag_core_api/models/information_piece.py | 10 +- .../src/rag_core_api/models/key_value_pair.py | 9 +- 13 files changed, 47 insertions(+), 176 deletions(-) delete mode 100644 admin-api-lib/tests/settings/__init__.py delete mode 100644 admin-api-lib/tests/settings/confluence_settings_test.py diff --git a/admin-api-lib/tests/settings/__init__.py b/admin-api-lib/tests/settings/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/admin-api-lib/tests/settings/confluence_settings_test.py b/admin-api-lib/tests/settings/confluence_settings_test.py deleted file mode 100644 index a98fe7b..0000000 --- a/admin-api-lib/tests/settings/confluence_settings_test.py +++ /dev/null @@ -1,108 +0,0 @@ -import pytest -from admin_api_lib.impl.settings.confluence_settings import ConfluenceSettings -from admin_api_lib.impl.utils.comma_separated_str_list import CommaSeparatedStrList -from admin_api_lib.impl.utils.comma_separated_bool_list import CommaSeparatedBoolList - - -def test_default_values(): - # When no settings are provided, all lists default to empty lists. - settings = ConfluenceSettings() - assert settings.url == CommaSeparatedStrList() - assert settings.token == CommaSeparatedStrList() - assert settings.space_key == CommaSeparatedStrList() - assert settings.document_name == CommaSeparatedStrList() - # Bool lists are empty by default if no url is provided. - assert settings.verify_ssl == CommaSeparatedBoolList() - assert settings.include_attachments == CommaSeparatedBoolList() - assert settings.keep_markdown_format == CommaSeparatedBoolList() - assert settings.keep_newlines == CommaSeparatedBoolList() - - -def test_valid_initialization_matching_lengths(): - # Provide all settings with matching lengths. 
- urls = "http://confluence1, http://confluence2" - tokens = "token1, token2" - space_keys = "SPACE1, SPACE2" - document_names = "Doc1, Doc2" - verify_ssl = "True, False" - include_attachments = "False, True" - keep_markdown_format = "True, True" - keep_newlines = "False, False" - - settings = ConfluenceSettings( - url=urls, - token=tokens, - space_key=space_keys, - document_name=document_names, - verify_ssl=verify_ssl, - include_attachments=include_attachments, - keep_markdown_format=keep_markdown_format, - keep_newlines=keep_newlines, - ) - - # Verify that the comma separated lists have been properly parsed. - assert settings.url == CommaSeparatedStrList(["http://confluence1", "http://confluence2"]) - assert settings.token == CommaSeparatedStrList(["token1", "token2"]) - assert settings.space_key == CommaSeparatedStrList(["SPACE1", "SPACE2"]) - assert settings.document_name == CommaSeparatedStrList(["Doc1", "Doc2"]) - assert settings.verify_ssl == CommaSeparatedBoolList([True, False]) - assert settings.include_attachments == CommaSeparatedBoolList([False, True]) - assert settings.keep_markdown_format == CommaSeparatedBoolList([True, True]) - assert settings.keep_newlines == CommaSeparatedBoolList([False, False]) - - -def test_mismatched_list_lengths(): - # Provide mismatched lengths for comma separated fields, should raise ValueError. - urls = "http://confluence1, http://confluence2, http://confluence3" - tokens = "token1, token2" # shorter than url list - with pytest.raises(ValueError): - ConfluenceSettings( - url=urls, - token=tokens, - space_key="SPACE1, SPACE2, SPACE3", - document_name="Doc1, Doc2, Doc3", - ) - - -def test_default_bool_values_when_missing(): - # Provide only url and leave bool fields empty to see if they are set to defaults. 
- urls = "http://confluence1, http://confluence2, http://confluence3" - settings = ConfluenceSettings( - url=urls, - token="token1, token2, token3", - space_key="SPACE1, SPACE2, SPACE3", - document_name="Doc1, Doc2, Doc3", - ) - # Defaults for bool fields: verify_ssl True, include_attachments False, - # keep_markdown_format True, keep_newlines True, for each entry. - expected_verify_ssl = CommaSeparatedBoolList([True, True, True]) - expected_include_attachments = CommaSeparatedBoolList([False, False, False]) - expected_keep_markdown_format = CommaSeparatedBoolList([True, True, True]) - expected_keep_newlines = CommaSeparatedBoolList([True, True, True]) - assert settings.verify_ssl == expected_verify_ssl - assert settings.include_attachments == expected_include_attachments - assert settings.keep_markdown_format == expected_keep_markdown_format - assert settings.keep_newlines == expected_keep_newlines - - -def test_bool_fields_not_overwritten_when_provided(): - # Provide bool fields explicitly; they should not be overwritten by defaults. 
- urls = "http://confluence1, http://confluence2" - settings = ConfluenceSettings( - url=urls, - token="token1, token2", - space_key="SPACE1, SPACE2", - document_name="Doc1, Doc2", - verify_ssl="False, False", - include_attachments="True, True", - keep_markdown_format="False, False", - keep_newlines="False, True", - ) - expected_verify_ssl = CommaSeparatedBoolList([False, False]) - expected_include_attachments = CommaSeparatedBoolList([True, True]) - expected_keep_markdown_format = CommaSeparatedBoolList([False, False]) - expected_keep_newlines = CommaSeparatedBoolList([False, True]) - assert settings.verify_ssl == expected_verify_ssl - assert settings.include_attachments == expected_include_attachments - assert settings.keep_markdown_format == expected_keep_markdown_format - assert settings.keep_newlines == expected_keep_newlines diff --git a/rag-core-api/src/rag_core_api/apis/rag_api.py b/rag-core-api/src/rag_core_api/apis/rag_api.py index fb432c6..dda92db 100644 --- a/rag-core-api/src/rag_core_api/apis/rag_api.py +++ b/rag-core-api/src/rag_core_api/apis/rag_api.py @@ -3,16 +3,12 @@ # coding: utf-8 # flake8: noqa: D105 -from asyncio import FIRST_COMPLETED, CancelledError, create_task, wait -from contextlib import suppress -import logging -from time import sleep -from typing import Dict, List # noqa: F401 import importlib +import logging import pkgutil - -from rag_core_api.apis.rag_api_base import BaseRagApi -import openapi_server.impl +from asyncio import FIRST_COMPLETED, CancelledError, create_task, sleep, wait +from contextlib import suppress +from typing import Any, Awaitable, List # noqa: F401 from fastapi import ( # noqa: F401 APIRouter, @@ -33,17 +29,11 @@ import rag_core_api.impl from rag_core_api.apis.rag_api_base import BaseRagApi -from rag_core_api.models.extra_models import TokenModel # noqa: F401 -from pydantic import Field, StrictStr -from typing import Any, List -import logging -from typing_extensions import Annotated from 
rag_core_api.models.chat_request import ChatRequest from rag_core_api.models.chat_response import ChatResponse from rag_core_api.models.delete_request import DeleteRequest from rag_core_api.models.information_piece import InformationPiece - logger = logging.getLogger(__name__) router = APIRouter() @@ -74,10 +64,8 @@ async def _disconnected(request: Request) -> None: ) async def chat( request: Request, - session_id: StrictStr = Path(..., description=""), - chat_request: Annotated[ChatRequest, Field(description="Chat with RAG.")] = Body( - None, description="Chat with RAG." - ), + session_id: str = Path(..., description=""), + chat_request: ChatRequest = Body(None, description="Chat with RAG."), ) -> ChatResponse | None: """ Asynchronously handles the chat endpoint for the RAG API. @@ -141,8 +129,6 @@ async def evaluate() -> None: ------- None """ - if not BaseRagApi.subclasses: - raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().evaluate() @@ -175,8 +161,6 @@ async def remove_information_piece( ------- None """ - if not BaseRagApi.subclasses: - raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().remove_information_piece(delete_request) @@ -208,6 +192,4 @@ async def upload_information_piece( ------- None """ - if not BaseRagApi.subclasses: - raise HTTPException(status_code=500, detail="Not implemented") return await BaseRagApi.subclasses[0]().upload_information_piece(information_piece) diff --git a/rag-core-api/src/rag_core_api/apis/rag_api_base.py b/rag-core-api/src/rag_core_api/apis/rag_api_base.py index 0b53f4b..615230d 100644 --- a/rag-core-api/src/rag_core_api/apis/rag_api_base.py +++ b/rag-core-api/src/rag_core_api/apis/rag_api_base.py @@ -2,11 +2,9 @@ # coding: utf-8 # flake8: noqa: D105 + from typing import ClassVar, Dict, List, Tuple # noqa: F401 -from pydantic import Field, StrictStr -from typing import Any, List -from typing_extensions import Annotated 
from rag_core_api.models.chat_request import ChatRequest from rag_core_api.models.chat_response import ChatResponse from rag_core_api.models.delete_request import DeleteRequest @@ -33,8 +31,8 @@ def __init_subclass__(cls, **kwargs): async def chat( self, - session_id: StrictStr, - chat_request: Annotated[ChatRequest, Field(description="Chat with RAG.")], + session_id: str, + chat_request: ChatRequest, ) -> ChatResponse: """ Asynchronously handles the chat endpoint for the RAG API. diff --git a/rag-core-api/src/rag_core_api/models/chat_history.py b/rag-core-api/src/rag_core_api/models/chat_history.py index 9087afe..5980dca 100644 --- a/rag-core-api/src/rag_core_api/models/chat_history.py +++ b/rag-core-api/src/rag_core_api/models/chat_history.py @@ -13,13 +13,14 @@ from __future__ import annotations + +import json import pprint import re # noqa: F401 -import json - +from typing import Any, ClassVar, Dict, List from pydantic import BaseModel, ConfigDict -from typing import Any, ClassVar, Dict, List + from rag_core_api.models.chat_history_message import ChatHistoryMessage try: @@ -46,8 +47,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/rag-core-api/src/rag_core_api/models/chat_history_message.py b/rag-core-api/src/rag_core_api/models/chat_history_message.py index c9d782b..c664092 100644 --- a/rag-core-api/src/rag_core_api/models/chat_history_message.py +++ b/rag-core-api/src/rag_core_api/models/chat_history_message.py @@ -13,13 +13,14 @@ from __future__ import annotations + +import json import pprint import re # noqa: F401 -import json - +from typing import Any, ClassVar, Dict, List from pydantic import BaseModel, ConfigDict, StrictStr -from 
typing import Any, ClassVar, Dict, List + from rag_core_api.models.chat_role import ChatRole try: @@ -47,8 +48,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/rag-core-api/src/rag_core_api/models/chat_request.py b/rag-core-api/src/rag_core_api/models/chat_request.py index 66090ef..1e0b135 100644 --- a/rag-core-api/src/rag_core_api/models/chat_request.py +++ b/rag-core-api/src/rag_core_api/models/chat_request.py @@ -13,13 +13,14 @@ from __future__ import annotations + +import json import pprint import re # noqa: F401 -import json - +from typing import Any, ClassVar, Dict, List, Optional from pydantic import BaseModel, ConfigDict, StrictStr -from typing import Any, ClassVar, Dict, List, Optional + from rag_core_api.models.chat_history import ChatHistory try: @@ -47,8 +48,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: @@ -86,7 +86,7 @@ def from_dict(cls, obj: Dict) -> Self: _obj = cls.model_validate( { - "history": ChatHistory.from_dict(obj.get("history")) if obj.get("history") is not None else None, + "history": (ChatHistory.from_dict(obj.get("history")) if obj.get("history") is not None else None), "message": obj.get("message"), } ) diff --git a/rag-core-api/src/rag_core_api/models/chat_response.py b/rag-core-api/src/rag_core_api/models/chat_response.py index ba8c6b1..a0fcf44 100644 --- 
a/rag-core-api/src/rag_core_api/models/chat_response.py
+++ b/rag-core-api/src/rag_core_api/models/chat_response.py
@@ -13,13 +13,14 @@
 
 from __future__ import annotations
 
+
+import json
 import pprint
 import re  # noqa: F401
-import json
-
+from typing import Any, ClassVar, Dict, List
 from pydantic import BaseModel, ConfigDict, Field, StrictStr
-from typing import Any, ClassVar, Dict, List
+
 from rag_core_api.models.information_piece import InformationPiece
 
 try:
@@ -48,8 +49,7 @@ def to_str(self) -> str:
 
     def to_json(self) -> str:
         """Returns the JSON representation of the model using alias"""
-        # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead
-        return json.dumps(self.to_dict())
+        return self.model_dump_json(by_alias=True, exclude_unset=True)
 
     @classmethod
     def from_json(cls, json_str: str) -> Self:
@@ -94,7 +94,7 @@ def from_dict(cls, obj: Dict) -> Self:
                 "answer": obj.get("answer"),
                 "finish_reason": obj.get("finish_reason"),
                 "citations": (
-                    [InformationPiece.from_dict(_item) for _item in obj.get("citations")]
+                    [InformationPiece.from_dict(_item) for _item in obj.get("citations")]
                     if obj.get("citations") is not None
                     else None
                 ),
diff --git a/rag-core-api/src/rag_core_api/models/chat_role.py b/rag-core-api/src/rag_core_api/models/chat_role.py
index 7e1c88d..cd2ff17 100644
--- a/rag-core-api/src/rag_core_api/models/chat_role.py
+++ b/rag-core-api/src/rag_core_api/models/chat_role.py
@@ -13,12 +13,12 @@
 
 from __future__ import annotations
 
+
 import json
 import pprint
 import re  # noqa: F401
 from enum import Enum
 
-
 try:
     from typing import Self
 except ImportError:
diff --git a/rag-core-api/src/rag_core_api/models/content_type.py b/rag-core-api/src/rag_core_api/models/content_type.py
index 7f4d874..3d39928 100644
--- a/rag-core-api/src/rag_core_api/models/content_type.py
+++ b/rag-core-api/src/rag_core_api/models/content_type.py
@@ -13,12 +13,12 @@
 
 from __future__ import annotations
 
+
 import json
 import pprint
 import re  # noqa: F401
 from enum
import Enum - try: from typing import Self except ImportError: diff --git a/rag-core-api/src/rag_core_api/models/delete_request.py b/rag-core-api/src/rag_core_api/models/delete_request.py index 8b40339..797dcf2 100644 --- a/rag-core-api/src/rag_core_api/models/delete_request.py +++ b/rag-core-api/src/rag_core_api/models/delete_request.py @@ -13,13 +13,14 @@ from __future__ import annotations + +import json import pprint import re # noqa: F401 -import json - +from typing import Any, ClassVar, Dict, List, Optional from pydantic import BaseModel, ConfigDict -from typing import Any, ClassVar, Dict, List, Optional + from rag_core_api.models.key_value_pair import KeyValuePair try: @@ -46,8 +47,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/rag-core-api/src/rag_core_api/models/information_piece.py b/rag-core-api/src/rag_core_api/models/information_piece.py index dfe8a42..b85092f 100644 --- a/rag-core-api/src/rag_core_api/models/information_piece.py +++ b/rag-core-api/src/rag_core_api/models/information_piece.py @@ -13,13 +13,14 @@ from __future__ import annotations + +import json import pprint import re # noqa: F401 -import json - +from typing import Any, ClassVar, Dict, List from pydantic import BaseModel, ConfigDict, Field, StrictStr -from typing import Any, ClassVar, Dict, List + from rag_core_api.models.content_type import ContentType from rag_core_api.models.key_value_pair import KeyValuePair @@ -53,8 +54,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + 
return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/rag-core-api/src/rag_core_api/models/key_value_pair.py b/rag-core-api/src/rag_core_api/models/key_value_pair.py index 3079959..abf0986 100644 --- a/rag-core-api/src/rag_core_api/models/key_value_pair.py +++ b/rag-core-api/src/rag_core_api/models/key_value_pair.py @@ -13,13 +13,13 @@ from __future__ import annotations + +import json import pprint import re # noqa: F401 -import json - +from typing import Any, ClassVar, Dict, List from pydantic import BaseModel, ConfigDict, Field, StrictStr -from typing import Any, ClassVar, Dict, List try: from typing import Self @@ -48,8 +48,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: From ef8dd202c5f94ffe1cfdfc7bdd9272b9d2dd2df9 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Tue, 20 May 2025 09:07:35 +0200 Subject: [PATCH 20/56] update doc for admin api --- README.md | 15 ++++---- .../api_endpoints/file_uploader.py | 17 ++++++++- .../api_endpoints/source_uploader.py | 23 ++++++++++- .../api_endpoints/default_file_uploader.py | 38 ++++++++++++++++++- .../api_endpoints/default_source_uploader.py | 38 +++++++++++++++++++ 5 files changed, 120 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 13d9231..159833c 100644 --- a/README.md +++ b/README.md @@ -105,8 +105,8 @@ The following endpoints are provided by the *admin-api-lib*: - `/delete_document/{identification}`: Deletes the file from storage (if applicable) and vector database. The `identification` can be retrieved from the `/all_documents_status` endpoint. 
- `/document_reference/{identification}`: Returns the document.
 - `/all_documents_status`: Return the `identification` and status of all available sources.
-- `/upload_documents`: Endpoint to upload files.
-- `/load_confluence`: Endpoint to load a confluence space
+- `/upload_file`: Endpoint to upload files.
+- `/upload_source`: Endpoint to upload non-file sources.
 
 ### 2.1 Requirements
 
@@ -135,14 +135,15 @@
 Will return the source document stored in the connected storage system.
 
 Will return a list of all sources for the chat and their current status.
 
-#### `/upload_documents`
+#### `/upload_file`
 
 Files can be uploaded here. This endpoint will process the document in a background and will extract information using the [document-extractor](#3-extractor-api-lib).
 The extracted information will be summarized using a LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api).
 
-#### `/load_confluence`
+#### `/upload_source`
 
-Loads all the content of a confluence space using the [document-extractor](#3-extractor-api-lib).
+Loads all the content from an arbitrary non-file source using the [document-extractor](#3-extractor-api-lib).
+The `type` of the source needs to correspond to an extractor in the [document-extractor](#3-extractor-api-lib).
 The extracted information will be summarized using LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api).
 
 ### 2.3 Replaceable parts
@@ -162,9 +163,9 @@ The extracted information will be summarized using LLM. The summary, as well as
| | document_deleter |[`admin_api_lib.api_endpoints.document_deleter.DocumentDeleter`](./admin-api-lib/src/admin_api_lib/api_endpoints/document_deleter.py) | [`admin_api_lib.impl.api_endpoints.default_document_deleter.DefaultDocumentDeleter`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py) | Handles deletion of sources. | | documents_status_retriever | [`admin_api_lib.api_endpoints.documents_status_retriever.DocumentsStatusRetriever`](./admin-api-lib/src/admin_api_lib/api_endpoints/documents_status_retriever.py) | [`admin_api_lib.impl.api_endpoints.default_documents_status_retriever.DefaultDocumentsStatusRetriever`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_documents_status_retriever.py) |Handles return of source status. | -| confluence_loader | [`admin_api_lib.api_endpoints.confluence_loader.ConfluenceLoader`](./admin-api-lib/src/admin_api_lib/api_endpoints/confluence_loader.py) | [`admin_api_lib.impl.api_endpoints.default_confluence_loader.DefaultConfluenceLoader`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_confluence_loader.py)| Handles data loading and extraction from confluence. | +| source_uploader | [`admin_api_lib.api_endpoints.source_uploader.SourceUploader`](./admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py) | [`admin_api_lib.impl.api_endpoints.default_source_uploader.DefaultSourceUploader`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py)| Handles data loading and extraction from various non-file sources. 
| | document_reference_retriever | [`admin_api_lib.api_endpoints.document_reference_retriever.DocumentReferenceRetriever`](./admin-api-lib/src/admin_api_lib/api_endpoints/document_reference_retriever.py) | [`admin_api_lib.impl.api_endpoints.default_document_reference_retriever.DefaultDocumentReferenceRetriever`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_reference_retriever.py) | Handles return of files from connected storage. | -| document_uploader | [`admin_api_lib.api_endpoints.document_uploader.DocumentUploader`](./admin-api-lib/src/admin_api_lib/api_endpoints/document_uploader.py) | [`admin_api_lib.impl.api_endpoints.default_document_uploader.DefaultDocumentUploader`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_uploader.py) | Handles upload and extraction of files. | +| file_uploader | [`admin_api_lib.api_endpoints.file_uploader.FileUploader`](./admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py) | [`admin_api_lib.impl.api_endpoints.default_file_uploader.DefaultFileUploader`](./admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py) | Handles upload and extraction of files. | ## 3. Extractor API Lib diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py index 2a33545..3dad40c 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -1,3 +1,4 @@ +"""Module for the upload file endpoint.""" from abc import ABC, abstractmethod from fastapi import UploadFile @@ -10,4 +11,18 @@ async def upload_file( self, base_url: str, file: UploadFile, - ) -> None: ... + ) -> None: + """ + Uploads a source file for content extraction. + + Parameters + ---------- + base_url : str + The base url of the service. Is used to determine the download link of the file. + file : UploadFile + The file to process. 
+ + Returns + ------- + None + """ \ No newline at end of file diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py index 3f9c15a..f135b54 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -1,3 +1,4 @@ +"""Module for the upload source endpoint.""" from abc import ABC, abstractmethod from pydantic import StrictStr @@ -6,7 +7,7 @@ class SourceUploader(ABC): - + """Abstract base class for source upload.""" @abstractmethod async def upload_source( self, @@ -14,4 +15,22 @@ async def upload_source( source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], - ) -> None: ... + ) -> None: + """ + Uploads the parameters for source content extraction. + + Parameters + ---------- + base_url : str + The base url of the service. Is used to determine the download link of the source. + source_type : str + The type of the source. Is used by the extractor service to determine the correct extraction method. + name : str + Display name of the source. + kwargs : list[KeyValuePair] + List of KeyValuePair with parameters used for the extraction. 
+ + Returns + ------- + None + """ \ No newline at end of file diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 62b6448..80db150 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -28,7 +28,7 @@ class DefaultFileUploader(FileUploader): - + """The DefaultFileUploader is responsible for adding a new source file document to the available content.""" def __init__( self, extractor_api: ExtractorApi, @@ -40,6 +40,28 @@ def __init__( information_mapper: InformationPiece2Document, file_service: FileService, ): + """ + Initialize the DefaultFileUploader. + + Parameters + ---------- + extractor_api : ExtractorApi + Client for the Extraction service. + key_value_store : FileStatusKeyValueStore + The key-value store for storing filename and the corresponding status. + information_enhancer : InformationEnhancer + The service for enhancing information. + chunker : Chunker + The service for chunking documents into chunks. + document_deleter : DocumentDeleter + The service for deleting documents. + rag_api : RagApi + The API for RAG backend. + information_mapper : InformationPiece2Document + The mapper for converting information pieces to langchain documents. + file_service : FileService + The service for handling file operations on the S3 storage + """ self._extractor_api = extractor_api self._rag_api = rag_api self._key_value_store = key_value_store @@ -55,6 +77,20 @@ async def upload_file( base_url: str, file: UploadFile, ) -> None: + """ + Uploads a source file for content extraction. + + Parameters + ---------- + base_url : str + The base url of the service. Is used to determine the download link of the file. + file : UploadFile + The file to process. 
+ + Returns + ------- + None + """ self._background_threads = [t for t in self._background_threads if t.is_alive()] try: diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index f843fa4..db9fe6c 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -36,6 +36,26 @@ def __init__( rag_api: RagApi, information_mapper: InformationPiece2Document, ): + """ + Initialize the DefaultSourceUploader. + + Parameters + ---------- + extractor_api : ExtractorApi + Client for the Extraction service. + key_value_store : FileStatusKeyValueStore + The key-value store for storing filename and the corresponding status. + information_enhancer : InformationEnhancer + The service for enhancing information. + chunker : Chunker + The service for chunking documents into chunks. + document_deleter : DocumentDeleter + The service for deleting documents. + rag_api : RagApi + The API for RAG backend. + information_mapper : InformationPiece2Document + The mapper for converting information pieces to langchain documents. + """ self._extractor_api = extractor_api self._rag_api = rag_api self._key_value_store = key_value_store @@ -52,6 +72,24 @@ async def upload_source( name: StrictStr, kwargs: list[KeyValuePair], ) -> None: + """ + Uploads the parameters for source content extraction. + + Parameters + ---------- + base_url : str + The base url of the service. Is used to determine the download link of the source. + source_type : str + The type of the source. Is used by the extractor service to determine the correct extraction method. + name : str + Display name of the source. + kwargs : list[KeyValuePair] + List of KeyValuePair with parameters used for the extraction. 
+ + Returns + ------- + None + """ self._background_threads = [t for t in self._background_threads if t.is_alive()] source_name = f"{source_type}:{sanitize_document_name(name)}" try: From a86f76c2b9851cc92c54574d6b6d5ce44c4e4598 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Tue, 20 May 2025 09:08:04 +0200 Subject: [PATCH 21/56] black --- .../src/admin_api_lib/api_endpoints/file_uploader.py | 5 +++-- .../src/admin_api_lib/api_endpoints/source_uploader.py | 6 ++++-- .../impl/api_endpoints/default_file_uploader.py | 1 + .../impl/api_endpoints/default_source_uploader.py | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py index 3dad40c..b8594c7 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -1,4 +1,5 @@ """Module for the upload file endpoint.""" + from abc import ABC, abstractmethod from fastapi import UploadFile @@ -11,7 +12,7 @@ async def upload_file( self, base_url: str, file: UploadFile, - ) -> None: + ) -> None: """ Uploads a source file for content extraction. 
@@ -25,4 +26,4 @@ async def upload_file( Returns ------- None - """ \ No newline at end of file + """ diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py index f135b54..f4b4e03 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -1,4 +1,5 @@ """Module for the upload source endpoint.""" + from abc import ABC, abstractmethod from pydantic import StrictStr @@ -8,6 +9,7 @@ class SourceUploader(ABC): """Abstract base class for source upload.""" + @abstractmethod async def upload_source( self, @@ -15,7 +17,7 @@ async def upload_source( source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], - ) -> None: + ) -> None: """ Uploads the parameters for source content extraction. @@ -33,4 +35,4 @@ async def upload_source( Returns ------- None - """ \ No newline at end of file + """ diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 80db150..b9b367f 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -29,6 +29,7 @@ class DefaultFileUploader(FileUploader): """The DefaultFileUploader is responsible for adding a new source file document to the available content.""" + def __init__( self, extractor_api: ExtractorApi, diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index db9fe6c..4fc7ff3 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -54,7 +54,7 @@ def __init__( rag_api : RagApi The API for RAG 
backend. information_mapper : InformationPiece2Document - The mapper for converting information pieces to langchain documents. + The mapper for converting information pieces to langchain documents. """ self._extractor_api = extractor_api self._rag_api = rag_api From acde7e53e541fce52a524e344cab260b45c27284 Mon Sep 17 00:00:00 2001 From: Melvin Klein Date: Tue, 20 May 2025 09:29:26 +0200 Subject: [PATCH 22/56] extractor comments --- README.md | 13 ++++++++----- .../api_endpoints/file_extractor.py | 2 +- .../api_endpoints/source_extractor.py | 1 + .../impl/api_endpoints/general_source_extractor.py | 10 ++++++---- .../impl/extractors/confluence_extractor.py | 6 +++--- 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 159833c..7becbcd 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,7 @@ This API should not be exposed by ingress and only used for internally. The following endpoints are provided by the *extractor-api-lib*: - `/extract_from_file`: This endpoint extracts the information from files. -- `/extract_from_confluence`: This endpoint extracts the information from a confluence space. +- `/extract_from_source`: This endpoint extracts the information from a non-file source. ### 3.1 Requirements @@ -203,12 +203,14 @@ The following types of information will be extracted: - `TEXT`: plain text - `TABLE`: data in tabular form found in the document -#### `/extract_from_confluence` +#### `/extract_from_source` -The extract from confluence endpoint will extract the information from a confluence space. -The following types of information will be extracted: +This endpoint will extract data for non-file source. 
+The type of information that is extracted will vary depending on the source; the following types of information can be extracted:
 
 - `TEXT`: plain text
+- `TABLE`: data in tabular form found in the document
+- `IMAGE`: images found in the document
 
 ### 3.3 Replaceable parts
 
@@ -222,7 +224,8 @@ The following types of information will be extracted:
 | all_extractors | `dependency_injector.providers.List[extractor_api_lib.document_parser.information_extractor.InformationExtractor]` | `dependency_injector.providers.List(pdf_extractor, ms_docs_extractor, xml_extractor)` | List of all available extractors. If you add a new type of extractor you would have to add it to this list. |
 | general_extractor | [`extractor_api_lib.document_parser.information_extractor.InformationExtractor`](./extractor-api-lib/src/extractor_api_lib/document_parser/information_extractor.py) |[`extractor_api_lib.document_parser.general_extractor.GeneralExtractor`](./extractor-api-lib/src/extractor_api_lib/document_parser/general_extractor.py) | Combines multiple extractors and decides which one to use for the given file format. |
 | file_extractor | [`extractor_api_lib.api_endpoints.file_extractor.FileExtractor`](./extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py) | [`extractor_api_lib.impl.api_endpoints.default_file_extractor.DefaultFileExtractor`](./extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_file_extractor.py) | Implementation of the `/extract_from_file` endpoint. Uses *general_extractor*. |
|
+| general_source_extractor | [`extractor_api_lib.api_endpoints.source_extractor.SourceExtractor`](./extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py) | [`extractor_api_lib.impl.api_endpoints.general_source_extractor.GeneralSourceExtractor`](./extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py) | Implementation of the `/extract_from_source` endpoint. Will decide the correct extractor for the source. |
+| confluence_extractor | [`extractor_api_lib.extractors.information_extractor.InformationExtractor`](./extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py) | [`extractor_api_lib.impl.extractors.confluence_extractor.ConfluenceExtractor`](./extractor-api-lib/src/extractor_api_lib/extractors/confluence_extractor.py) | Implementation of an extractor for the source `confluence`. |
 
 ## 4. RAG Core Lib
 
diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py
index ad968a2..2c9a645 100644
--- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py
+++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py
@@ -4,7 +4,7 @@
 
 
 class FileExtractor(ABC):
-    """Abstract base class for extract_information endpoint."""
+    """Abstract base class for extract_from_file endpoint."""
 
     @abstractmethod
     async def aextract_information(self, extraction_request: ExtractionRequest) -> list[InformationPiece]:
diff --git a/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py b/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py
index d656367..4071322 100644
--- a/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py
+++ b/extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py
@@ -5,6 +5,7 @@
 
 class SourceExtractor(ABC):
+    """Abstract base class for extract_from_source endpoint."""
 
     @abstractmethod
async def aextract_information( diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py index 0c5dbe4..70bfab8 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py @@ -28,6 +28,8 @@ def __init__(self, available_extractors: list[InformationExtractor], mapper: Int ---------- available_extractors : list of InformationExtractor A list of available information extractors to be used by the GeneralExtractor. + mapper : Internal2ExternalInformationPiece + Mapper for mapping the internal representation to the external one. """ self._mapper = mapper self._available_extractors = available_extractors @@ -37,17 +39,17 @@ async def aextract_information( extraction_parameters: ExtractionParameters, ) -> list[InformationPiece]: """ - Extract content from given file. + Extract information from source, using the given parameters. Parameters ---------- - file_path : Path - Path to the file the information should be extracted from. + extraction_parameters : ExtractionParameters + The parameters used to extract information from the source. Returns ------- list[InformationPiece] - The extracted information. + A list of extracted information pieces. 
""" correct_extractors = [x for x in self._available_extractors if extraction_parameters.type == x.extractor_type] if not correct_extractors: diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py index 3cb55f4..f1c15a6 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py @@ -12,14 +12,14 @@ class ConfluenceExtractor(InformationExtractor): - """Default implementation of the FileExtractor interface.""" + """Implementation of the InformationExtractor interface for confluence.""" def __init__( self, mapper: ConfluenceLangchainDocument2InformationPiece, ): """ - Initialize the DefaultConfluenceExtractor. + Initialize the ConfluenceExtractor. Parameters ---------- @@ -42,7 +42,7 @@ async def aextract_content( Parameters ---------- - confluence_parameters : ConfluenceParameters + extraction_parameters : ExtractionParameters The parameters required to connect to and extract data from Confluence. 
Returns From 5e6ca9ba2911e6d21150f9b0b8164f8699749747 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 23 May 2025 10:13:16 +0200 Subject: [PATCH 23/56] fix: minor bugs --- admin-api-lib/src/admin_api_lib/apis/admin_api.py | 2 +- admin-api-lib/src/admin_api_lib/impl/admin_api.py | 2 +- .../impl/api_endpoints/default_source_uploader.py | 2 +- .../impl/api_endpoints/general_source_extractor.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index ec95b92..fc8d867 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -173,4 +173,4 @@ async def upload_source( """Uploads user selected sources.""" if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().upload_source(type, name, key_value_pair, request) + return await BaseAdminApi.subclasses[0]().upload_source(source_type, name, key_value_pair, request) diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index 04cd6df..fbc62eb 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -96,7 +96,7 @@ async def upload_source( request: Request, source_uploader: SourceUploader = Depends(Provide[DependencyContainer.source_uploader]), ) -> None: - await source_uploader.upload_source(str(request.base_url), type, name, kwargs) + await source_uploader.upload_source(str(request.base_url), source_type, name, kwargs) @inject async def upload_file( diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 4fc7ff3..2770553 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ 
b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -120,7 +120,7 @@ async def _handle_source_upload( ): try: information_pieces = self._extractor_api.extract_from_source( - ExtractionParameters(type=source_type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs]) + ExtractionParameters(source_type=source_type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs]) ) if not information_pieces: diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py index 70bfab8..8e08ad6 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py @@ -51,8 +51,8 @@ async def aextract_information( list[InformationPiece] A list of extracted information pieces. """ - correct_extractors = [x for x in self._available_extractors if extraction_parameters.type == x.extractor_type] + correct_extractors = [x for x in self._available_extractors if extraction_parameters.source_type == x.extractor_type] if not correct_extractors: - raise ValueError(f"No extractor found for type {type}") + raise ValueError(f"No extractor found for type {extraction_parameters.source_type}") results = await correct_extractors[-1].aextract_content(extraction_parameters) return [self._mapper.map_internal_to_external(x) for x in results if x.page_content is not None] From c5c537b8a2be2a1f8c669c9ec8bbb3a812375e02 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 23 May 2025 14:35:26 +0200 Subject: [PATCH 24/56] refactor: remove unused utility modules and tests --- .../src/admin_api_lib/impl/utils/__init__.py | 0 .../impl/utils/comma_separated_bool_list.py | 65 ---------------- .../impl/utils/comma_separated_str_list.py | 74 ------------------- .../tests/comma_separated_bool_list_test.py | 55 
-------------- .../tests/comma_separated_str_list_test.py | 49 ------------ admin-api-lib/tests/dummy_test.py | 3 + 6 files changed, 3 insertions(+), 243 deletions(-) delete mode 100644 admin-api-lib/src/admin_api_lib/impl/utils/__init__.py delete mode 100644 admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_bool_list.py delete mode 100644 admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_str_list.py delete mode 100644 admin-api-lib/tests/comma_separated_bool_list_test.py delete mode 100644 admin-api-lib/tests/comma_separated_str_list_test.py create mode 100644 admin-api-lib/tests/dummy_test.py diff --git a/admin-api-lib/src/admin_api_lib/impl/utils/__init__.py b/admin-api-lib/src/admin_api_lib/impl/utils/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_bool_list.py b/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_bool_list.py deleted file mode 100644 index df23553..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_bool_list.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Utility module to handle comma separated string input that represents boolean values.""" - -from typing import Any - - -class CommaSeparatedBoolList(list): - """ - A subclass of list that converts comma-separated strings or lists into a list of booleans. - - Notes - ----- - - For string inputs, splits the string by commas and converts recognized true values ("true", "1", "yes") to True. - - An empty or whitespace-only string returns an empty list. - - For list inputs, each element is converted to a boolean. - """ - - @classmethod - def validate(cls, v: Any, info) -> list[bool]: - """ - Validate and convert the input into a list of booleans. - - Parameters - ---------- - v : Any - Input value, either a comma separated string or a list. - info : Any - Additional context information (unused). - - Returns - ------- - list of bool - List of booleans parsed from the input. 
An empty string returns an empty list. - - Raises - ------ - ValueError - If v is not a string or list. - """ - - def str_to_bool(s: str) -> bool: - return s.lower() in ("true", "1", "yes") - - if isinstance(v, str): - if v.strip() == "": - return [] - return [str_to_bool(item.strip()) for item in v.split(",") if item.strip()] - elif isinstance(v, list): - return [bool(item) for item in v] - raise ValueError("Not a valid comma separated boolean list") - - @classmethod - def __get_validators__(cls): - """ - Get validator functions for Pydantic to use with this data type. - - This method is called by Pydantic during model initialization to collect - validator functions for fields using this custom data type. - - Returns - ------- - generator - A generator yielding validator functions, specifically `cls.validate`, - which will be applied to validate and convert input values. - """ - yield cls.validate diff --git a/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_str_list.py b/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_str_list.py deleted file mode 100644 index 7b3a2a9..0000000 --- a/admin-api-lib/src/admin_api_lib/impl/utils/comma_separated_str_list.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -Comma Separated String List Utility Module. - -This module provides a custom list type to validate and convert inputs into -a list of strings. It splits comma separated strings and converts list elements -to strings. - -Raises ------- -ValueError - If the provided input is neither a string nor a list. -""" - -from typing import Any - - -class CommaSeparatedStrList(list): - """ - Custom list type that validates comma separated strings. - - - If input is a string: splits by commas and strips whitespace. - - If input is a list: converts all elements to strings. - - Raises - ------ - ValueError - For invalid input type. - """ - - @classmethod - def validate(cls, v: Any, info) -> list[str]: - """ - Convert input to a validated list of strings. 
- - Parameters - ---------- - v : Any - A comma-separated string or a list containing items to be converted. - info : Any - Additional contextual information (not used in current implementation). - - Returns - ------- - list of str - A list of trimmed strings. Returns an empty list for an empty or whitespace-only string. - - Raises - ------ - ValueError - If the input v is neither a string nor a list. - """ - if isinstance(v, str): - if v.strip() == "": - return [] - return [item.strip() for item in v.split(",") if item.strip()] - elif isinstance(v, list): - return [str(item) for item in v] - raise ValueError("Not a valid comma separated string list") - - @classmethod - def __get_validators__(cls): - """ - Get validator functions for Pydantic to use with this data type. - - This method is called by Pydantic during model initialization to collect - validator functions for fields using this custom data type. - - Returns - ------- - generator - A generator yielding validator functions, specifically `cls.validate`, - which will be applied to validate and convert input values. - """ - yield cls.validate diff --git a/admin-api-lib/tests/comma_separated_bool_list_test.py b/admin-api-lib/tests/comma_separated_bool_list_test.py deleted file mode 100644 index d6a72d3..0000000 --- a/admin-api-lib/tests/comma_separated_bool_list_test.py +++ /dev/null @@ -1,55 +0,0 @@ -import pytest -from admin_api_lib.impl.utils.comma_separated_bool_list import CommaSeparatedBoolList - - -def test_validate_empty_string(): - # An empty string should return an empty list. - assert CommaSeparatedBoolList.validate("", None) == [] - - -def test_validate_string_input(): - # Test a typical comma separated string. - # "true", "yes", and "1" are considered True, all others are False. 
- input_str = "true, false, yes, no, 1, 0, ,TRUE, YeS" - expected = [ - True, # "true" - False, # "false" - True, # "yes" - False, # "no" - True, # "1" - False, # "0" - True, # "TRUE" - True, # "YeS" - ] - # Note: extra whitespace items are ignored. - result = CommaSeparatedBoolList.validate(input_str, None) - assert result == expected - - -def test_validate_string_with_extra_commas(): - # Test string with extra commas and spaces. - input_str = "true,, yes, ,false" - expected = [True, True, False] - result = CommaSeparatedBoolList.validate(input_str, None) - assert result == expected - - -def test_validate_list_input(): - # When input is a list, each element is cast to bool. - input_list = [0, 1, True, False, "non-empty", ""] - expected = [ - False, # bool(0) - True, # bool(1) - True, # bool(True) - False, # bool(False) - True, # bool("non-empty") - False, # bool("") - ] - result = CommaSeparatedBoolList.validate(input_list, None) - assert result == expected - - -def test_invalid_input_type(): - # Passing a non-string and non-list should raise a ValueError. 
- with pytest.raises(ValueError): - CommaSeparatedBoolList.validate(123, None) diff --git a/admin-api-lib/tests/comma_separated_str_list_test.py b/admin-api-lib/tests/comma_separated_str_list_test.py deleted file mode 100644 index a86c048..0000000 --- a/admin-api-lib/tests/comma_separated_str_list_test.py +++ /dev/null @@ -1,49 +0,0 @@ -import pytest -from admin_api_lib.impl.utils.comma_separated_str_list import CommaSeparatedStrList - - -def test_validate_string(): - # simple comma separated string - input_str = "a, b, c" - expected = ["a", "b", "c"] - result = CommaSeparatedStrList.validate(input_str, None) - assert result == expected - - input_str = "a" - expected = ["a"] - result = CommaSeparatedStrList.validate(input_str, None) - assert result == expected - - -def test_validate_string_with_extra_spaces(): - # string with extra spaces and empty items - input_str = " apple , banana , , cherry , " - expected = ["apple", "banana", "cherry"] - result = CommaSeparatedStrList.validate(input_str, None) - assert result == expected - - -def test_validate_empty_string(): - input_str = "" - expected = [] - result = CommaSeparatedStrList.validate(input_str, None) - assert result == expected - - -def test_validate_string_only_spaces(): - input_str = " " - expected = [] - result = CommaSeparatedStrList.validate(input_str, None) - assert result == expected - - -def test_validate_list(): - input_list = [1, "2", 3.0, " test "] - expected = ["1", "2", "3.0", " test "] - result = CommaSeparatedStrList.validate(input_list, None) - assert result == expected - - -def test_invalid_input_type(): - with pytest.raises(ValueError): - CommaSeparatedStrList.validate(12345, None) diff --git a/admin-api-lib/tests/dummy_test.py b/admin-api-lib/tests/dummy_test.py new file mode 100644 index 0000000..1428394 --- /dev/null +++ b/admin-api-lib/tests/dummy_test.py @@ -0,0 +1,3 @@ +def test_dummy() -> None: + print("Dummy test.") + assert True From 0133c00d2847990c936e966d1d1d233bf1a5a862 Mon Sep 
17 00:00:00 2001 From: Andreas Klos Date: Fri, 23 May 2025 15:06:03 +0200 Subject: [PATCH 25/56] docs: enhance module docstrings and method descriptions across the admin and extractor APIs --- .../src/admin_api_lib/apis/admin_api.py | 2 + .../src/admin_api_lib/apis/admin_api_base.py | 36 ++++++++++++++-- .../src/admin_api_lib/impl/admin_api.py | 36 ++++++++++++++++ .../api_endpoints/default_source_uploader.py | 7 +--- .../admin_api_lib/models/document_status.py | 5 +-- .../models/http_validation_error.py | 4 +- .../admin_api_lib/models/key_value_pair.py | 3 +- .../src/admin_api_lib/models/status.py | 2 +- .../admin_api_lib/models/validation_error.py | 3 +- .../models/validation_error_loc_inner.py | 3 +- .../extractor_api_lib/apis/extractor_api.py | 33 +++++++++++++++ .../apis/extractor_api_base.py | 41 ++++++++++++++++++- 12 files changed, 152 insertions(+), 23 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index fc8d867..5a332be 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -1,3 +1,5 @@ +"""Module for the Admin API.""" + # coding: utf-8 from typing import Dict, List # noqa: F401 diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index e184692..432c457 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -1,4 +1,7 @@ +"""Module for the base AdminApi interface.""" + # coding: utf-8 +# flake8: noqa: D105 from typing import ClassVar, Dict, List, Tuple # noqa: F401 from typing_extensions import Annotated @@ -11,6 +14,15 @@ class BaseAdminApi: + """ + The base AdminApi interface. + + Attributes + ---------- + subclasses : ClassVar[Tuple] + A tuple that holds all subclasses of BaseAdminApi. 
+ """ + subclasses: ClassVar[Tuple] = () def __init_subclass__(cls, **kwargs): @@ -71,12 +83,30 @@ async def upload_source( key_value_pair: List[KeyValuePair], request: Request, ) -> None: - """Uploads user selected source.""" + """ + Asynchronously uploads user selected source. + + Returns + ------- + None + """ async def upload_file( self, file: UploadFile, request: Request, ) -> None: - """Uploads user selected file.""" - ... + """ + Asynchronously uploads user-selected documents. + + Parameters + ---------- + file : UploadFile + The file object containing the source documents to be uploaded. + request : Request + The request object containing metadata about the upload request. + + Returns + ------- + None + """ diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index fbc62eb..08cc550 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -96,6 +96,26 @@ async def upload_source( request: Request, source_uploader: SourceUploader = Depends(Provide[DependencyContainer.source_uploader]), ) -> None: + """ + Asynchronously uploads user-selected source documents. + + Parameters + ---------- + source_type : StrictStr + The type of the source document to be uploaded. + name : StrictStr + The name of the source document to be uploaded. + kwargs : list[KeyValuePair] + Additional parameters required for the extractor. + request : Request + The HTTP request object containing metadata about the upload request. + source_uploader : SourceUploader + An instance of SourceUploader to handle the upload process. + + Returns + ------- + None + """ await source_uploader.upload_source(str(request.base_url), source_type, name, kwargs) @inject @@ -105,6 +125,22 @@ async def upload_file( request: Request, file_uploader: FileUploader = Depends(Provide[DependencyContainer.file_uploader]), ) -> None: + """ + Asynchronously uploads a file to the server. 
+ + Parameters + ---------- + file : UploadFile + The file object to be uploaded. + request : Request + The HTTP request object containing metadata about the upload request. + file_uploader : FileUploader, optional + An instance of FileUploader to handle the upload process. + + Returns + ------- + None + """ await file_uploader.upload_file(str(request.base_url), file) @inject diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 2770553..1e1ed33 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -98,7 +98,7 @@ async def upload_source( source_name, Status.PROCESSING ) # TODO: change to pipeline with timeout to error status thread = Thread( - target=lambda: run(self._handle_source_upload(source_name, base_url, source_type, name, kwargs)) + target=lambda: run(self._handle_source_upload(source_name, source_type, kwargs)) ) thread.start() self._background_threads.append(thread) @@ -113,9 +113,7 @@ async def upload_source( async def _handle_source_upload( self, source_name: str, - base_url: str, source_type: StrictStr, - name: str, kwargs: list[KeyValuePair], ): try: @@ -135,8 +133,7 @@ async def _handle_source_upload( self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents ] - # Replace old document - # deletion is allowed to fail + # Replace old document, deletion is allowed to fail with suppress(Exception): await self._document_deleter.adelete_document(source_name) diff --git a/admin-api-lib/src/admin_api_lib/models/document_status.py b/admin-api-lib/src/admin_api_lib/models/document_status.py index ff2f94a..89b09d8 100644 --- a/admin-api-lib/src/admin_api_lib/models/document_status.py +++ b/admin-api-lib/src/admin_api_lib/models/document_status.py @@ -3,7 +3,7 @@ """ admin-api-lib -The 
API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. The version of the OpenAPI document: 1.0.0 Generated by OpenAPI Generator (https://openapi-generator.tech) @@ -49,8 +49,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/admin-api-lib/src/admin_api_lib/models/http_validation_error.py b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py index 7e288e1..28c83f0 100644 --- a/admin-api-lib/src/admin_api_lib/models/http_validation_error.py +++ b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py @@ -48,9 +48,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) - + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of HTTPValidationError from a JSON string""" diff --git a/admin-api-lib/src/admin_api_lib/models/key_value_pair.py b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py index 82c0c37..3d46e01 100644 --- a/admin-api-lib/src/admin_api_lib/models/key_value_pair.py +++ b/admin-api-lib/src/admin_api_lib/models/key_value_pair.py @@ -48,8 +48,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + 
return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/admin-api-lib/src/admin_api_lib/models/status.py b/admin-api-lib/src/admin_api_lib/models/status.py index 0ab750b..3b24b73 100644 --- a/admin-api-lib/src/admin_api_lib/models/status.py +++ b/admin-api-lib/src/admin_api_lib/models/status.py @@ -3,7 +3,7 @@ """ admin-api-lib -The API is used for the communication between the admin frontend and the admin backend in the rag project. +The API is used for the communication between the admin frontend and the admin backend in the rag project. The version of the OpenAPI document: 1.0.0 Generated by OpenAPI Generator (https://openapi-generator.tech) diff --git a/admin-api-lib/src/admin_api_lib/models/validation_error.py b/admin-api-lib/src/admin_api_lib/models/validation_error.py index f922b21..ac389ab 100644 --- a/admin-api-lib/src/admin_api_lib/models/validation_error.py +++ b/admin-api-lib/src/admin_api_lib/models/validation_error.py @@ -50,8 +50,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py b/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py index 0100c88..e487669 100644 --- a/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py +++ b/admin-api-lib/src/admin_api_lib/models/validation_error_loc_inner.py @@ -55,8 +55,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return 
self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py index 7d09897..4f9e4e5 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api.py @@ -1,3 +1,5 @@ +"""Module for the Extractor API.""" + # coding: utf-8 from typing import Dict, List # noqa: F401 @@ -48,6 +50,19 @@ async def extract_from_file_post( extraction_request: ExtractionRequest = Body(None, description=""), ) -> List[InformationPiece]: + """ + Extract information from a file based on the provided extraction request. + + Parameters + ---------- + extraction_request : ExtractionRequest + The request object containing details about the extraction process. + + Returns + ------- + List[InformationPiece] + A list of extracted information pieces. + """ if not BaseExtractorApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") return await BaseExtractorApi.subclasses[0]().extract_from_file_post(extraction_request) @@ -67,6 +82,24 @@ async def extract_from_file_post( async def extract_from_source( extraction_parameters: ExtractionParameters = Body(None, description=""), ) -> List[InformationPiece]: + """ + Extract information from a source based on the provided extraction parameters. + + Parameters + ---------- + extraction_parameters : ExtractionParameters, optional + The request object containing details about the extraction process. + + Returns + ------- + List[InformationPiece] + A list of extracted information pieces. + + Raises + ------ + HTTPException + If the extraction process fails or encounters an error. 
+ """ if not BaseExtractorApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") return await BaseExtractorApi.subclasses[0]().extract_from_source(extraction_parameters) diff --git a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py index 696c60c..acb6022 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py @@ -1,3 +1,5 @@ +"""Module for the base ExtractorApi interface.""" + # coding: utf-8 from typing import ClassVar, Dict, List, Tuple # noqa: F401 @@ -8,6 +10,15 @@ class BaseExtractorApi: + """ + The base ExtractorApi interface. + + Attributes + ---------- + subclasses : ClassVar[Tuple] + A tuple containing all subclasses of BaseExtractorApi. + """ + subclasses: ClassVar[Tuple] = () def __init_subclass__(cls, **kwargs): @@ -17,9 +28,35 @@ def __init_subclass__(cls, **kwargs): async def extract_from_file_post( self, extraction_request: ExtractionRequest, - ) -> List[InformationPiece]: ... + ) -> List[InformationPiece]: + """ + Extract information from a file based on the provided extraction request. + + Parameters + ---------- + extraction_request : ExtractionRequest + The request object containing details about the extraction process. + + Returns + ------- + List[InformationPiece] + A list of extracted information pieces. + """ async def extract_from_source( self, extraction_parameters: ExtractionParameters, - ) -> List[InformationPiece]: ... + ) -> List[InformationPiece]: + """ + Extract information from a source based on the provided extraction request. + + Parameters + ---------- + extraction_parameters : ExtractionParameters + The parameters required to access and extract information from the source. + + Returns + ------- + List[InformationPiece] + A list of extracted information pieces. 
+ """ From 4bfd3f171f290b53e732ec61885cfafdf15fc279 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Wed, 28 May 2025 08:26:11 +0200 Subject: [PATCH 26/56] working sample --- .../api_endpoints/default_file_uploader.py | 24 ++- .../api_endpoints/default_source_uploader.py | 170 +++++++++++++----- .../tests/default_source_uploader_test.py | 145 +++++++++++++++ admin-api-lib/tests/dummy_test.py | 3 - .../tests/test_default_source_uploader.py | 0 .../tests/{dummy_test.py => dummy5_test.py} | 0 rag-core-api/tests/rag_api_test.py | 26 +-- .../tests/{dummy_test.py => dummy6_test.py} | 0 8 files changed, 309 insertions(+), 59 deletions(-) create mode 100644 admin-api-lib/tests/default_source_uploader_test.py delete mode 100644 admin-api-lib/tests/dummy_test.py create mode 100644 admin-api-lib/tests/test_default_source_uploader.py rename extractor-api-lib/tests/{dummy_test.py => dummy5_test.py} (100%) rename rag-core-lib/tests/{dummy_test.py => dummy6_test.py} (100%) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index b9b367f..fed469d 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -98,7 +98,7 @@ async def upload_file( content = await file.read() file.filename = sanitize_document_name(file.filename) source_name = f"file:{sanitize_document_name(file.filename)}" - # TODO: check if document already in processing state + self._check_if_already_in_processing(source_name) self._key_value_store.upsert( source_name, Status.PROCESSING ) # TODO: change to pipeline with timeout to error status @@ -116,6 +116,28 @@ async def upload_file( logger.error("Error while uploading %s = %s", source_name, str(e)) raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) + def _check_if_already_in_processing(self, 
source_name: str) -> None: + """ + Checks if the source is already in processing state. + + Parameters + ---------- + source_name : str + The name of the source. + + Returns + ------- + None + + Raises + ------ + ValueError + If the source is already in processing state. + """ + existing = [s for name, s in self._key_value_store.get_all() if name == source_name] + if any(s == Status.PROCESSING for s in existing): + raise ValueError(f"Document {source_name} is already in processing state") + async def _handle_source_upload( self, s3_path: Path, diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 1e1ed33..d6ecd5e 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -1,12 +1,13 @@ -from http.client import HTTPException + +from concurrent.futures import ThreadPoolExecutor import logging -from asyncio import run -from threading import Thread +import asyncio +from threading import Thread, Event from contextlib import suppress from pydantic import StrictStr -from fastapi import status - +from fastapi import status, HTTPException +from langchain_core.documents import Document from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters @@ -20,9 +21,14 @@ from admin_api_lib.impl.key_db.file_status_key_value_store import FileStatusKeyValueStore from admin_api_lib.information_enhancer.information_enhancer import InformationEnhancer from admin_api_lib.utils.utils import sanitize_document_name +from admin_api_lib.rag_backend_client.openapi_client.models.information_piece import ( + InformationPiece as RagInformationPiece, +) logger = logging.getLogger(__name__) +class 
UploadCancelled(Exception): + pass class DefaultSourceUploader(SourceUploader): @@ -63,7 +69,7 @@ def __init__( self._information_enhancer = information_enhancer self._chunker = chunker self._document_deleter = document_deleter - self._background_threads = [] + self._background_tasks = [] async def upload_source( self, @@ -71,75 +77,155 @@ async def upload_source( source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], + timeout: float = 300.0, ) -> None: - """ - Uploads the parameters for source content extraction. - - Parameters - ---------- - base_url : str - The base url of the service. Is used to determine the download link of the source. - source_type : str - The type of the source. Is used by the extractor service to determine the correct extraction method. - name : str - Display name of the source. - kwargs : list[KeyValuePair] - List of KeyValuePair with parameters used for the extraction. + # 1) prune finished tasks + self._background_tasks = [ + (fut, ev) for fut, ev in self._background_tasks + if not fut.done() + ] - Returns - ------- - None - """ - self._background_threads = [t for t in self._background_threads if t.is_alive()] source_name = f"{source_type}:{sanitize_document_name(name)}" try: - # TODO: check if document already in processing state - self._key_value_store.upsert( - source_name, Status.PROCESSING - ) # TODO: change to pipeline with timeout to error status - thread = Thread( - target=lambda: run(self._handle_source_upload(source_name, source_type, kwargs)) + self._check_if_already_in_processing(source_name) + self._key_value_store.upsert(source_name, Status.PROCESSING) + + # 1) make a stop‐event for cooperative cancellation + stop_event = Event() + + # 2) submit the real work to a ThreadPoolExecutor + loop = asyncio.get_running_loop() + # you can reuse one executor or make a new one + executor = ThreadPoolExecutor(max_workers=1) + future = loop.run_in_executor( + executor, + lambda: asyncio.run( + self._handle_source_upload( 
+ source_name, source_type, kwargs, stop_event + ) + ) ) - thread.start() - self._background_threads.append(thread) + # track both thread‐future and its stop‐event + self._background_tasks.append((future, stop_event)) + + # 3) await with a timeout, *without* blocking the loop + try: + await asyncio.wait_for(future, timeout) + except asyncio.TimeoutError: + # mark error, signal the thread, and move on + self._key_value_store.upsert(source_name, Status.ERROR) + stop_event.set() + logger.error("Upload of %s timed out; signaled stop_event", source_name) + except ValueError as e: self._key_value_store.upsert(source_name, Status.ERROR) - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, detail=str(e) + ) except Exception as e: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("Error while uploading %s = %s", source_name, str(e)) - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e) + ) + + + def _on_upload_timeout(self, source_name: str, thread: Thread) -> None: + """ + Called by the event loop after `timeout` seconds. + Sets the stop_event so that the worker can exit cleanly. + """ + if thread.is_alive(): + logger.error("Upload of %s timed out; signaling thread to stop", source_name) + # mark as error in your store + self._key_value_store.upsert(source_name, Status.ERROR) + # signal the worker to bail out at next checkpoint + thread.stop_event.set() + + + def _check_if_already_in_processing(self, source_name: str) -> None: + """ + Checks if the source is already in processing state. + + Parameters + ---------- + source_name : str + The name of the source. + + Returns + ------- + None + + Raises + ------ + ValueError + If the source is already in processing state. 
+ """ + existing = [s for name, s in self._key_value_store.get_all() if name == source_name] + if any(s == Status.PROCESSING for s in existing): + raise ValueError(f"Document {source_name} is already in processing state") + + @staticmethod + def _ensure_not_cancelled(stop_event, source_name, store): + if stop_event.is_set(): + # mark as error or cancelled if you like + store.upsert(source_name, Status.ERROR) + raise UploadCancelled() async def _handle_source_upload( self, source_name: str, source_type: StrictStr, kwargs: list[KeyValuePair], + stop_event: Event ): try: information_pieces = self._extractor_api.extract_from_source( - ExtractionParameters(source_type=source_type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs]) + ExtractionParameters( + source_type=source_type, + document_name=source_name, + kwargs=[x.to_dict() for x in kwargs] + ) ) if not information_pieces: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("No information pieces found in the document: %s", source_name) - documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces] + return + DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) + documents: list[Document] = [] + for piece in information_pieces: + documents.append(self._information_mapper.extractor_information_piece2document(piece)) + DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) chunked_documents = self._chunker.chunk(documents) + DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents) - rag_information_pieces = [ - self._information_mapper.document2rag_information_piece(doc) for doc in enhanced_documents - ] - # Replace old document, deletion is allowed to fail + DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) + 
rag_information_pieces: list[RagInformationPiece] = [] + for doc in enhanced_documents: + rag_information_pieces.append( + self._information_mapper.document2rag_information_piece(doc) + ) + + DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) with suppress(Exception): await self._document_deleter.adelete_document(source_name) self._rag_api.upload_information_piece(rag_information_pieces) + self._key_value_store.upsert(source_name, Status.READY) logger.info("Source uploaded successfully: %s", source_name) + + except UploadCancelled: + logger.info("Upload of %s aborted by timeout", source_name) + return except Exception as e: - self._key_value_store.upsert(source_name, Status.ERROR) - logger.error("Error while uploading %s = %s", source_name, str(e)) + # If it wasn’t our own cancellation, record the error + if stop_event.is_set(): + logger.info("Upload of %s aborted due to timeout", source_name) + else: + self._key_value_store.upsert(source_name, Status.ERROR) + logger.error("Error while uploading %s = %s", source_name, str(e)) diff --git a/admin-api-lib/tests/default_source_uploader_test.py b/admin-api-lib/tests/default_source_uploader_test.py new file mode 100644 index 0000000..edfa823 --- /dev/null +++ b/admin-api-lib/tests/default_source_uploader_test.py @@ -0,0 +1,145 @@ +import pytest +from unittest.mock import AsyncMock, MagicMock +from fastapi import HTTPException +import threading, time + +from admin_api_lib.impl.api_endpoints.default_source_uploader import DefaultSourceUploader +from admin_api_lib.models.status import Status +from admin_api_lib.utils.utils import sanitize_document_name +from admin_api_lib.impl.api_endpoints import default_source_uploader + +@pytest.fixture +def mocks(): + extractor_api = MagicMock() + key_value_store = MagicMock() + key_value_store.get_all.return_value = [] + information_enhancer = MagicMock() + information_enhancer.ainvoke = AsyncMock() + chunker = MagicMock() + document_deleter = 
MagicMock() + document_deleter.adelete_document = AsyncMock() + rag_api = MagicMock() + information_mapper = MagicMock() + return extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + + +@pytest.mark.asyncio +async def test_handle_source_upload_success(mocks): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + # Setup mocks + dummy_piece = MagicMock() + extractor_api.extract_from_source.return_value = [dummy_piece] + dummy_doc = MagicMock() + information_mapper.extractor_information_piece2document.return_value = dummy_doc + chunker.chunk.return_value = [dummy_doc] + information_enhancer.ainvoke.return_value = [dummy_doc] + dummy_rag_piece = {"p": "v"} + information_mapper.document2rag_information_piece.return_value = dummy_rag_piece + + uploader = DefaultSourceUploader( + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + ) + + await uploader._handle_source_upload("source1", "type1", []) + + key_value_store.upsert.assert_any_call("source1", Status.READY) + rag_api.upload_information_piece.assert_called_once_with([dummy_rag_piece]) + document_deleter.adelete_document.assert_awaited_once_with("source1") + + +@pytest.mark.asyncio +async def test_handle_source_upload_no_info_pieces(mocks): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + extractor_api.extract_from_source.return_value = [] + + uploader = DefaultSourceUploader( + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + ) + await uploader._handle_source_upload("source2", "type2", []) + + key_value_store.upsert.assert_any_call("source2", Status.ERROR) + information_mapper.extractor_information_piece2document.assert_not_called() + 
rag_api.upload_information_piece.assert_not_called() + + +@pytest.mark.asyncio +async def test_upload_source_already_processing_raises_error(mocks): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + source_type = "typeX" + name = "Doc Name" + source_name = f"{source_type}:{sanitize_document_name(name)}" + key_value_store.get_all.return_value = [(source_name, Status.PROCESSING)] + uploader = DefaultSourceUploader( + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + ) + with pytest.raises(HTTPException): + await uploader.upload_source("http://base", source_type, name, []) + key_value_store.upsert.assert_any_call(source_name, Status.ERROR) + + +@pytest.mark.asyncio +async def test_upload_source_not_processing_starts_thread(mocks, monkeypatch): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + key_value_store.get_all.return_value = [] + dummy_thread = MagicMock() + monkeypatch.setattr('admin_api_lib.impl.api_endpoints.default_source_uploader.Thread', lambda *args, **kwargs: dummy_thread) + uploader = DefaultSourceUploader( + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + ) + await uploader.upload_source("http://base", "typeY", "nameY", []) + key_value_store.upsert.assert_any_call(f"typeY:{sanitize_document_name('nameY')}", Status.PROCESSING) + dummy_thread.start.assert_called_once() + +@pytest.mark.asyncio +async def test_upload_source_no_timeout(mocks, monkeypatch): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + key_value_store.get_all.return_value = [] + source_type = "typeZ" + name = "quick" + source_name = f"{source_type}:{sanitize_document_name(name)}" + # dummy thread that finishes before timeout + dummy_thread = MagicMock() + 
dummy_thread.is_alive.return_value = False + monkeypatch.setattr( + 'admin_api_lib.impl.api_endpoints.default_source_uploader.Thread', + lambda *args, **kwargs: dummy_thread + ) + uploader = DefaultSourceUploader( + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + ) + # should not raise + await uploader.upload_source("http://base", source_type, name, []) + # only PROCESSING status upserted, no ERROR + key_value_store.upsert.assert_any_call(source_name, Status.PROCESSING) + assert not any(call.args[1] == Status.ERROR for call in key_value_store.upsert.call_args_list) + +@pytest.mark.asyncio +async def test_upload_source_timeout_error(mocks, monkeypatch): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + key_value_store.get_all.return_value = [] + source_type = "typeTimeout" + name = "slow" + source_name = f"{source_type}:{sanitize_document_name(name)}" + # simulate slow thread sleeping 2s; patch timeout to 1s + def slow_thread_factory(*args, **kwargs): + return threading.Thread(target=lambda: time.sleep(2), daemon=True) + monkeypatch.setattr(default_source_uploader, 'Thread', slow_thread_factory) + uploader = DefaultSourceUploader( + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + ) + with pytest.raises(HTTPException) as exc: + await uploader.upload_source("http://base", source_type, name, [], timeout=1.0) + assert "timed out" in exc.value.detail + key_value_store.upsert.assert_any_call(source_name, Status.ERROR) diff --git a/admin-api-lib/tests/dummy_test.py b/admin-api-lib/tests/dummy_test.py deleted file mode 100644 index 1428394..0000000 --- a/admin-api-lib/tests/dummy_test.py +++ /dev/null @@ -1,3 +0,0 @@ -def test_dummy() -> None: - print("Dummy test.") - assert True diff --git a/admin-api-lib/tests/test_default_source_uploader.py 
b/admin-api-lib/tests/test_default_source_uploader.py new file mode 100644 index 0000000..e69de29 diff --git a/extractor-api-lib/tests/dummy_test.py b/extractor-api-lib/tests/dummy5_test.py similarity index 100% rename from extractor-api-lib/tests/dummy_test.py rename to extractor-api-lib/tests/dummy5_test.py diff --git a/rag-core-api/tests/rag_api_test.py b/rag-core-api/tests/rag_api_test.py index 372709c..2cbdf8e 100644 --- a/rag-core-api/tests/rag_api_test.py +++ b/rag-core-api/tests/rag_api_test.py @@ -14,23 +14,23 @@ from qdrant_client import QdrantClient from qdrant_client.http import models -from .mock_environment_variables import mock_environment_variables -from .mock_logging_directory import mock_logging_config +from mock_environment_variables import mock_environment_variables +from mock_logging_directory import mock_logging_config mock_environment_variables() mock_logging_config() -from src.rag_core_api.main import app -from src.rag_core_api.models.chat_request import ChatRequest -from src.rag_core_api.models.chat_history import ChatHistory -from src.rag_core_api.models.chat_history_message import ChatHistoryMessage -from src.rag_core_api.models.chat_role import ChatRole -from src.rag_core_api.models.information_piece import InformationPiece -from src.rag_core_api.models.content_type import ContentType -from src.rag_core_api.models.key_value_pair import KeyValuePair -from src.rag_core_api.models.delete_request import DeleteRequest -from src.rag_core_api.impl.settings.fake_embedder_settings import FakeEmbedderSettings -from src.rag_core_api.impl.settings.error_messages import ErrorMessages +from rag_core_api.main import app +from rag_core_api.models.chat_request import ChatRequest +from rag_core_api.models.chat_history import ChatHistory +from rag_core_api.models.chat_history_message import ChatHistoryMessage +from rag_core_api.models.chat_role import ChatRole +from rag_core_api.models.information_piece import InformationPiece +from 
rag_core_api.models.content_type import ContentType +from rag_core_api.models.key_value_pair import KeyValuePair +from rag_core_api.models.delete_request import DeleteRequest +from rag_core_api.impl.settings.fake_embedder_settings import FakeEmbedderSettings +from rag_core_api.impl.settings.error_messages import ErrorMessages @pytest_asyncio.fixture diff --git a/rag-core-lib/tests/dummy_test.py b/rag-core-lib/tests/dummy6_test.py similarity index 100% rename from rag-core-lib/tests/dummy_test.py rename to rag-core-lib/tests/dummy6_test.py From c07d93957e16def55c146c2240d0e69e0582c5e0 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Wed, 28 May 2025 11:43:33 +0200 Subject: [PATCH 27/56] refactor: improve threading model in DefaultSourceUploader and update timeout handling --- .../api_endpoints/default_source_uploader.py | 100 +++++------------- .../tests/default_source_uploader_test.py | 62 +++++------ 2 files changed, 59 insertions(+), 103 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index d6ecd5e..c0d1389 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -1,5 +1,5 @@ -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError import logging import asyncio from threading import Thread, Event @@ -27,9 +27,6 @@ logger = logging.getLogger(__name__) -class UploadCancelled(Exception): - pass - class DefaultSourceUploader(SourceUploader): def __init__( @@ -69,7 +66,7 @@ def __init__( self._information_enhancer = information_enhancer self._chunker = chunker self._document_deleter = document_deleter - self._background_tasks = [] + self._background_threads = [] async def upload_source( self, @@ -77,46 +74,18 @@ async def upload_source( 
source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], - timeout: float = 300.0, + timeout: float = 3600.0, ) -> None: - # 1) prune finished tasks - self._background_tasks = [ - (fut, ev) for fut, ev in self._background_tasks - if not fut.done() - ] + self._background_threads = [t for t in self._background_threads if t.is_alive()] source_name = f"{source_type}:{sanitize_document_name(name)}" try: self._check_if_already_in_processing(source_name) self._key_value_store.upsert(source_name, Status.PROCESSING) - # 1) make a stop‐event for cooperative cancellation - stop_event = Event() - - # 2) submit the real work to a ThreadPoolExecutor - loop = asyncio.get_running_loop() - # you can reuse one executor or make a new one - executor = ThreadPoolExecutor(max_workers=1) - future = loop.run_in_executor( - executor, - lambda: asyncio.run( - self._handle_source_upload( - source_name, source_type, kwargs, stop_event - ) - ) - ) - # track both thread‐future and its stop‐event - self._background_tasks.append((future, stop_event)) - - # 3) await with a timeout, *without* blocking the loop - try: - await asyncio.wait_for(future, timeout) - except asyncio.TimeoutError: - # mark error, signal the thread, and move on - self._key_value_store.upsert(source_name, Status.ERROR) - stop_event.set() - logger.error("Upload of %s timed out; signaled stop_event", source_name) - + thread = Thread(target=self._thread_worker, args=(source_name, source_type, kwargs, timeout)) + thread.start() + self._background_threads.append(thread) except ValueError as e: self._key_value_store.upsert(source_name, Status.ERROR) raise HTTPException( @@ -130,19 +99,6 @@ async def upload_source( ) - def _on_upload_timeout(self, source_name: str, thread: Thread) -> None: - """ - Called by the event loop after `timeout` seconds. - Sets the stop_event so that the worker can exit cleanly. 
- """ - if thread.is_alive(): - logger.error("Upload of %s timed out; signaling thread to stop", source_name) - # mark as error in your store - self._key_value_store.upsert(source_name, Status.ERROR) - # signal the worker to bail out at next checkpoint - thread.stop_event.set() - - def _check_if_already_in_processing(self, source_name: str) -> None: """ Checks if the source is already in processing state. @@ -165,19 +121,30 @@ def _check_if_already_in_processing(self, source_name: str) -> None: if any(s == Status.PROCESSING for s in existing): raise ValueError(f"Document {source_name} is already in processing state") - @staticmethod - def _ensure_not_cancelled(stop_event, source_name, store): - if stop_event.is_set(): - # mark as error or cancelled if you like - store.upsert(source_name, Status.ERROR) - raise UploadCancelled() + def _thread_worker(self,source_name, source_type, kwargs, timeout): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete( + asyncio.wait_for( + self._handle_source_upload(source_name=source_name, source_type=source_type, kwargs=kwargs), + timeout=timeout + ) + ) + except asyncio.TimeoutError: + logger.error("Upload of %s timed out after %s seconds", source_name, timeout) + self._key_value_store.upsert(source_name, Status.ERROR) + except Exception as e: + logger.exception("Error while uploading %s", source_name) + self._key_value_store.upsert(source_name, Status.ERROR) + finally: + loop.close() async def _handle_source_upload( self, source_name: str, source_type: StrictStr, kwargs: list[KeyValuePair], - stop_event: Event ): try: information_pieces = self._extractor_api.extract_from_source( @@ -192,25 +159,20 @@ async def _handle_source_upload( self._key_value_store.upsert(source_name, Status.ERROR) logger.error("No information pieces found in the document: %s", source_name) return - DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) documents: 
list[Document] = [] for piece in information_pieces: documents.append(self._information_mapper.extractor_information_piece2document(piece)) - DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) chunked_documents = self._chunker.chunk(documents) - DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents) - DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) rag_information_pieces: list[RagInformationPiece] = [] for doc in enhanced_documents: rag_information_pieces.append( self._information_mapper.document2rag_information_piece(doc) ) - DefaultSourceUploader._ensure_not_cancelled(stop_event, source_name, self._key_value_store) with suppress(Exception): await self._document_deleter.adelete_document(source_name) @@ -218,14 +180,6 @@ async def _handle_source_upload( self._key_value_store.upsert(source_name, Status.READY) logger.info("Source uploaded successfully: %s", source_name) - - except UploadCancelled: - logger.info("Upload of %s aborted by timeout", source_name) - return except Exception as e: - # If it wasn’t our own cancellation, record the error - if stop_event.is_set(): - logger.info("Upload of %s aborted due to timeout", source_name) - else: - self._key_value_store.upsert(source_name, Status.ERROR) - logger.error("Error while uploading %s = %s", source_name, str(e)) + self._key_value_store.upsert(source_name, Status.ERROR) + logger.error("Error while uploading %s = %s", source_name, str(e)) diff --git a/admin-api-lib/tests/default_source_uploader_test.py b/admin-api-lib/tests/default_source_uploader_test.py index edfa823..51a2963 100644 --- a/admin-api-lib/tests/default_source_uploader_test.py +++ b/admin-api-lib/tests/default_source_uploader_test.py @@ -1,3 +1,4 @@ +import asyncio import pytest from unittest.mock import AsyncMock, MagicMock from fastapi import 
HTTPException @@ -88,20 +89,6 @@ async def test_upload_source_already_processing_raises_error(mocks): await uploader.upload_source("http://base", source_type, name, []) key_value_store.upsert.assert_any_call(source_name, Status.ERROR) - -@pytest.mark.asyncio -async def test_upload_source_not_processing_starts_thread(mocks, monkeypatch): - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks - key_value_store.get_all.return_value = [] - dummy_thread = MagicMock() - monkeypatch.setattr('admin_api_lib.impl.api_endpoints.default_source_uploader.Thread', lambda *args, **kwargs: dummy_thread) - uploader = DefaultSourceUploader( - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper - ) - await uploader.upload_source("http://base", "typeY", "nameY", []) - key_value_store.upsert.assert_any_call(f"typeY:{sanitize_document_name('nameY')}", Status.PROCESSING) - dummy_thread.start.assert_called_once() - @pytest.mark.asyncio async def test_upload_source_no_timeout(mocks, monkeypatch): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks @@ -109,21 +96,19 @@ async def test_upload_source_no_timeout(mocks, monkeypatch): source_type = "typeZ" name = "quick" source_name = f"{source_type}:{sanitize_document_name(name)}" - # dummy thread that finishes before timeout + # patch Thread so no actual background work is done dummy_thread = MagicMock() - dummy_thread.is_alive.return_value = False - monkeypatch.setattr( - 'admin_api_lib.impl.api_endpoints.default_source_uploader.Thread', - lambda *args, **kwargs: dummy_thread - ) + monkeypatch.setattr(default_source_uploader, 'Thread', lambda *args, **kwargs: dummy_thread) uploader = DefaultSourceUploader( extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper ) # should not raise - await 
uploader.upload_source("http://base", source_type, name, []) + await uploader.upload_source("http://base", source_type, name, [], timeout=1.0) # only PROCESSING status upserted, no ERROR - key_value_store.upsert.assert_any_call(source_name, Status.PROCESSING) + assert any(call.args[1] == Status.PROCESSING for call in key_value_store.upsert.call_args_list) assert not any(call.args[1] == Status.ERROR for call in key_value_store.upsert.call_args_list) + dummy_thread.start.assert_called_once() + @pytest.mark.asyncio async def test_upload_source_timeout_error(mocks, monkeypatch): @@ -132,14 +117,31 @@ async def test_upload_source_timeout_error(mocks, monkeypatch): source_type = "typeTimeout" name = "slow" source_name = f"{source_type}:{sanitize_document_name(name)}" - # simulate slow thread sleeping 2s; patch timeout to 1s - def slow_thread_factory(*args, **kwargs): - return threading.Thread(target=lambda: time.sleep(2), daemon=True) - monkeypatch.setattr(default_source_uploader, 'Thread', slow_thread_factory) + # monkey-patch the handler to sleep so that timeout triggers + async def fake_handle(self, source_name_arg, source_type_arg, kwargs_arg): + await asyncio.sleep(3600) + # patch handler and Thread to trigger timeout synchronously + monkeypatch.setattr( + default_source_uploader.DefaultSourceUploader, + '_handle_source_upload', + fake_handle + ) + def FakeThread(target, args=(), **kwargs): + # this ensures serial execution, so that the error status can be checked + class T: + def start(self_inner): + target(*args) + def is_alive(self_inner): + return False + return T() + monkeypatch.setattr(default_source_uploader, 'Thread', FakeThread) uploader = DefaultSourceUploader( extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper ) - with pytest.raises(HTTPException) as exc: - await uploader.upload_source("http://base", source_type, name, [], timeout=1.0) - assert "timed out" in exc.value.detail - 
key_value_store.upsert.assert_any_call(source_name, Status.ERROR) + # no exception should be raised; timeout path sets ERROR status + + await uploader.upload_source("http://base", source_type, name, [], timeout=1.0) + # first call marks PROCESSING, second marks ERROR + calls = [call.args for call in key_value_store.upsert.call_args_list] + assert (source_name, Status.PROCESSING) in calls + assert (source_name, Status.ERROR) in calls From a46b4fdb5c0f3681d65c6e6424788803f9d407d4 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Wed, 28 May 2025 12:55:36 +0200 Subject: [PATCH 28/56] feat: add timeout parameter to file and source upload methods and enhance documentation --- .../api_endpoints/file_uploader.py | 4 + .../api_endpoints/source_uploader.py | 7 +- .../src/admin_api_lib/apis/admin_api.py | 27 +++- .../src/admin_api_lib/apis/admin_api_base.py | 10 +- .../src/admin_api_lib/impl/admin_api.py | 5 +- .../api_endpoints/default_file_uploader.py | 34 ++++- .../api_endpoints/default_source_uploader.py | 22 ++- .../tests/default_file_uploader_test.py | 142 ++++++++++++++++++ .../tests/default_source_uploader_test.py | 7 +- 9 files changed, 233 insertions(+), 25 deletions(-) create mode 100644 admin-api-lib/tests/default_file_uploader_test.py diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py index b8594c7..d146a1b 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -1,6 +1,7 @@ """Module for the upload file endpoint.""" from abc import ABC, abstractmethod +from typing import Optional from fastapi import UploadFile @@ -12,6 +13,7 @@ async def upload_file( self, base_url: str, file: UploadFile, + timeout: Optional[float], ) -> None: """ Uploads a source file for content extraction. @@ -22,6 +24,8 @@ async def upload_file( The base url of the service. 
Is used to determine the download link of the file. file : UploadFile The file to process. + timeout : float, optional + Timeout for the operation. Returns ------- diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py index f4b4e03..95c9d6e 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -1,6 +1,7 @@ """Module for the upload source endpoint.""" from abc import ABC, abstractmethod +from typing import Optional from pydantic import StrictStr @@ -13,24 +14,24 @@ class SourceUploader(ABC): @abstractmethod async def upload_source( self, - base_url: str, source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], + timeout: Optional[float], ) -> None: """ Uploads the parameters for source content extraction. Parameters ---------- - base_url : str - The base url of the service. Is used to determine the download link of the source. source_type : str The type of the source. Is used by the extractor service to determine the correct extraction method. name : str Display name of the source. kwargs : list[KeyValuePair] List of KeyValuePair with parameters used for the extraction. + timeout : float, optional + Timeout for the operation. Returns ------- diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index 5a332be..a323bd6 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -148,7 +148,16 @@ async def upload_file( file: UploadFile, request: Request, ) -> None: - """Uploads user selected sources.""" + """ + Uploads user selected sources. + + Parameters + ---------- + file : UploadFile + The file to be uploaded. + request : Request + The HTTP request object containing metadata about the upload request. 
+ """ if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") return await BaseAdminApi.subclasses[0]().upload_file(file, request) @@ -167,12 +176,22 @@ async def upload_file( response_model_by_alias=True, ) async def upload_source( - request: Request, source_type: StrictStr = Query(None, description="", alias="type"), name: StrictStr = Query(None, description="", alias="name"), key_value_pair: List[KeyValuePair] = Body(None, description=""), ) -> None: - """Uploads user selected sources.""" + """ + Uploads user selected sources. + + Parameters + ---------- + source_type : str + The type of the source. Is used by the extractor service to determine the correct extractor to use. + name : str + Display name of the source. + key_value_pair : List[KeyValuePair] + List of KeyValuePair with parameters used for the extraction. + """ if not BaseAdminApi.subclasses: raise HTTPException(status_code=500, detail="Not implemented") - return await BaseAdminApi.subclasses[0]().upload_source(source_type, name, key_value_pair, request) + return await BaseAdminApi.subclasses[0]().upload_source(source_type, name, key_value_pair) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py index 432c457..e3841b9 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api_base.py @@ -81,11 +81,19 @@ async def upload_source( source_type: StrictStr, name: StrictStr, key_value_pair: List[KeyValuePair], - request: Request, ) -> None: """ Asynchronously uploads user selected source. + Parameters + ---------- + source_type : str + The type of the source. Is used by the extractor service to determine the correct extractor to use. + name : str + Display name of the source. + key_value_pair : list[KeyValuePair] + List of KeyValuePair with parameters used for the extraction. 
+ Returns ------- None diff --git a/admin-api-lib/src/admin_api_lib/impl/admin_api.py b/admin-api-lib/src/admin_api_lib/impl/admin_api.py index 08cc550..4ecdd4c 100644 --- a/admin-api-lib/src/admin_api_lib/impl/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/impl/admin_api.py @@ -93,7 +93,6 @@ async def upload_source( source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], - request: Request, source_uploader: SourceUploader = Depends(Provide[DependencyContainer.source_uploader]), ) -> None: """ @@ -107,8 +106,6 @@ async def upload_source( The name of the source document to be uploaded. kwargs : list[KeyValuePair] Additional parameters required for the extractor. - request : Request - The HTTP request object containing metadata about the upload request. source_uploader : SourceUploader An instance of SourceUploader to handle the upload process. @@ -116,7 +113,7 @@ async def upload_source( ------- None """ - await source_uploader.upload_source(str(request.base_url), source_type, name, kwargs) + await source_uploader.upload_source(source_type, name, kwargs) @inject async def upload_file( diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index fed469d..5a61b02 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -1,13 +1,14 @@ -from http.client import HTTPException +import asyncio import logging from pathlib import Path import traceback from threading import Thread +from typing import Optional import urllib import tempfile from contextlib import suppress -from fastapi import UploadFile, status +from fastapi import UploadFile, status, HTTPException from langchain_core.documents import Document from asyncio import run @@ -77,6 +78,7 @@ async def upload_file( self, base_url: str, file: UploadFile, + timeout: 
Optional[float] = 3600.0, ) -> None: """ Uploads a source file for content extraction. @@ -99,13 +101,9 @@ async def upload_file( file.filename = sanitize_document_name(file.filename) source_name = f"file:{sanitize_document_name(file.filename)}" self._check_if_already_in_processing(source_name) - self._key_value_store.upsert( - source_name, Status.PROCESSING - ) # TODO: change to pipeline with timeout to error status + self._key_value_store.upsert(source_name, Status.PROCESSING) s3_path = await self._asave_new_document(content, file.filename, source_name) - thread = Thread( - target=lambda: run(self._handle_source_upload(s3_path, source_name, file.filename, base_url)) - ) + thread = Thread(target=self._thread_worker, args=(s3_path, source_name, file.filename, base_url, timeout)) thread.start() self._background_threads.append(thread) except ValueError as e: @@ -138,6 +136,25 @@ def _check_if_already_in_processing(self, source_name: str) -> None: if any(s == Status.PROCESSING for s in existing): raise ValueError(f"Document {source_name} is already in processing state") + def _thread_worker(self,s3_path, source_name, filename, base_url, timeout): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete( + asyncio.wait_for( + self._handle_source_upload(s3_path, source_name, filename, base_url), + timeout=timeout + ) + ) + except asyncio.TimeoutError: + logger.error("Upload of %s timed out after %s seconds", source_name, timeout) + self._key_value_store.upsert(source_name, Status.ERROR) + except Exception as e: + logger.exception("Error while uploading %s", source_name) + self._key_value_store.upsert(source_name, Status.ERROR) + finally: + loop.close() + async def _handle_source_upload( self, s3_path: Path, @@ -153,6 +170,7 @@ async def _handle_source_upload( if not information_pieces: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("No information pieces found in the document: %s", source_name) + raise 
Exception("No information pieces found") documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces] chunked_documents = self._chunker.chunk(documents) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index c0d1389..c91fd75 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -70,12 +70,30 @@ def __init__( async def upload_source( self, - base_url: str, source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], timeout: float = 3600.0, ) -> None: + """ + Uploads the parameters for source content extraction. + + Parameters + ---------- + source_type : str + The type of the source. Is used by the extractor service to determine the correct extraction method. + name : str + Display name of the source. + kwargs : list[KeyValuePair] + List of KeyValuePair with parameters used for the extraction. + timeout : float, optional + Timeout for the operation, by default 3600.0 seconds (1 hour). 
+ + Returns + ------- + None + """ + self._background_threads = [t for t in self._background_threads if t.is_alive()] source_name = f"{source_type}:{sanitize_document_name(name)}" @@ -158,7 +176,7 @@ async def _handle_source_upload( if not information_pieces: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("No information pieces found in the document: %s", source_name) - return + raise Exception("No information pieces found") documents: list[Document] = [] for piece in information_pieces: documents.append(self._information_mapper.extractor_information_piece2document(piece)) diff --git a/admin-api-lib/tests/default_file_uploader_test.py b/admin-api-lib/tests/default_file_uploader_test.py new file mode 100644 index 0000000..8cceb14 --- /dev/null +++ b/admin-api-lib/tests/default_file_uploader_test.py @@ -0,0 +1,142 @@ +import asyncio +import pytest +from unittest.mock import AsyncMock, MagicMock +from fastapi import HTTPException +from fastapi import UploadFile +import threading, time + +from admin_api_lib.impl.api_endpoints.default_file_uploader import DefaultFileUploader +from admin_api_lib.models.status import Status +from admin_api_lib.utils.utils import sanitize_document_name +from admin_api_lib.impl.api_endpoints import default_file_uploader + +@ pytest.fixture +def mocks(): + extractor_api = MagicMock() + key_value_store = MagicMock() + key_value_store.get_all.return_value = [] + information_enhancer = MagicMock() + information_enhancer.ainvoke = AsyncMock() + chunker = MagicMock() + document_deleter = MagicMock() + document_deleter.adelete_document = AsyncMock() + rag_api = MagicMock() + information_mapper = MagicMock() + return extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + +@ pytest.mark.asyncio +async def test_handle_file_upload_success(mocks): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + # setup 
mocks + dummy_piece = MagicMock() + extractor_api.extract_from_file_post.return_value = [dummy_piece] + dummy_doc = MagicMock() + information_mapper.extractor_information_piece2document.return_value = dummy_doc + chunker.chunk.return_value = [dummy_doc] + information_enhancer.ainvoke.return_value = [dummy_doc] + dummy_rag = {"foo": "bar"} + information_mapper.document2rag_information_piece.return_value = dummy_rag + + uploader = DefaultFileUploader( + extractor_api, key_value_store, information_enhancer, chunker, + document_deleter, rag_api, information_mapper, file_service=MagicMock() + ) + + await uploader._handle_source_upload("s3path", "file:doc1", "doc1.txt", "http://base") + + key_value_store.upsert.assert_any_call("file:doc1", Status.READY) + rag_api.upload_information_piece.assert_called_once_with([dummy_rag]) + document_deleter.adelete_document.assert_awaited_once_with("file:doc1") + +@ pytest.mark.asyncio +async def test_handle_file_upload_no_info_pieces(mocks): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + extractor_api.extract_from_file_post.return_value = [] + + uploader = DefaultFileUploader( + extractor_api, key_value_store, information_enhancer, chunker, + document_deleter, rag_api, information_mapper, file_service=MagicMock() + ) + filename = "file:doc2" + await uploader._handle_source_upload("s3path", filename, "doc2.txt", "http://base") + + key_value_store.upsert.assert_any_call(filename, Status.ERROR) + information_mapper.extractor_information_piece2document.assert_not_called() + rag_api.upload_information_piece.assert_not_called() + +@ pytest.mark.asyncio +async def test_upload_file_already_processing_raises_error(mocks): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + base_url = "http://base" + file = MagicMock(spec=UploadFile) + file.filename = "doc3.txt" + file.read = AsyncMock(return_value=b"") 
+ source_name = f"file:{sanitize_document_name(file.filename)}" + key_value_store.get_all.return_value = [(source_name, Status.PROCESSING)] + + uploader = DefaultFileUploader( + extractor_api, key_value_store, information_enhancer, chunker, + document_deleter, rag_api, information_mapper, file_service=MagicMock() + ) + + with pytest.raises(HTTPException): + await uploader.upload_file(base_url, file) + key_value_store.upsert.assert_any_call(source_name, Status.ERROR) + +@ pytest.mark.asyncio +async def test_upload_file_starts_thread(mocks, monkeypatch): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + base_url = "http://base" + file = MagicMock(spec=UploadFile) + file.filename = "doc4.txt" + file.read = AsyncMock(return_value=b"content") + key_value_store.get_all.return_value = [] + source_name = f"file:{sanitize_document_name(file.filename)}" + + dummy_thread = MagicMock() + monkeypatch.setattr(default_file_uploader, 'Thread', lambda *args, **kwargs: dummy_thread) + + uploader = DefaultFileUploader( + extractor_api, key_value_store, information_enhancer, chunker, + document_deleter, rag_api, information_mapper, file_service=MagicMock() + ) + + await uploader.upload_file(base_url, file) + + key_value_store.upsert.assert_any_call(source_name, Status.PROCESSING) + dummy_thread.start.assert_called_once() + +@ pytest.mark.asyncio +async def test_upload_file_timeout_error(mocks, monkeypatch): + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + base_url = "http://base" + file = MagicMock(spec=UploadFile) + file.filename = "slow.txt" + file.read = AsyncMock(return_value=b"") + key_value_store.get_all.return_value = [] + source_name = f"file:{sanitize_document_name(file.filename)}" + + # fast fake handler that sleeps long + async def fake_handle(self, s3_path, source_name_arg, filename, base_url_arg): + await asyncio.sleep(3600) + 
monkeypatch.setattr( + default_file_uploader.DefaultFileUploader, + '_handle_source_upload', + fake_handle + ) + def FakeThread(target, args=(), **kwargs): + class T: + def start(self_inner): target(*args) + def is_alive(self_inner): return False + return T() + monkeypatch.setattr(default_file_uploader, 'Thread', FakeThread) + + uploader = DefaultFileUploader( + extractor_api, key_value_store, information_enhancer, chunker, + document_deleter, rag_api, information_mapper, file_service=MagicMock() + ) + + await uploader.upload_file(base_url, file, timeout=0.1) + calls = [c.args for c in key_value_store.upsert.call_args_list] + assert (source_name, Status.PROCESSING) in calls + assert (source_name, Status.ERROR) in calls diff --git a/admin-api-lib/tests/default_source_uploader_test.py b/admin-api-lib/tests/default_source_uploader_test.py index 51a2963..9210a0c 100644 --- a/admin-api-lib/tests/default_source_uploader_test.py +++ b/admin-api-lib/tests/default_source_uploader_test.py @@ -86,7 +86,8 @@ async def test_upload_source_already_processing_raises_error(mocks): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper ) with pytest.raises(HTTPException): - await uploader.upload_source("http://base", source_type, name, []) + # use default timeout + await uploader.upload_source(source_type, name, []) key_value_store.upsert.assert_any_call(source_name, Status.ERROR) @pytest.mark.asyncio @@ -103,7 +104,7 @@ async def test_upload_source_no_timeout(mocks, monkeypatch): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper ) # should not raise - await uploader.upload_source("http://base", source_type, name, [], timeout=1.0) + await uploader.upload_source(source_type, name, [], timeout=1.0) # only PROCESSING status upserted, no ERROR assert any(call.args[1] == Status.PROCESSING for call in key_value_store.upsert.call_args_list) assert not any(call.args[1] == 
Status.ERROR for call in key_value_store.upsert.call_args_list) @@ -140,7 +141,7 @@ def is_alive(self_inner): ) # no exception should be raised; timeout path sets ERROR status - await uploader.upload_source("http://base", source_type, name, [], timeout=1.0) + await uploader.upload_source(source_type, name, [], timeout=1.0) # first call marks PROCESSING, second marks ERROR calls = [call.args for call in key_value_store.upsert.call_args_list] assert (source_name, Status.PROCESSING) in calls From e7599d16bd239fe1e7036da3f611523395736000 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 07:36:05 +0200 Subject: [PATCH 29/56] feat: implement UploaderBase class and enhance document deletion logic with optional key-value store removal --- admin-api-lib/docs/thread_management.md | 0 .../examples/thread_management_example.py | 0 admin-api-lib/pyproject.toml | 2 +- .../api_endpoints/document_deleter.py | 4 +- .../api_endpoints/file_uploader.py | 6 +- .../api_endpoints/source_uploader.py | 7 +- .../api_endpoints/uploader_base.py | 30 ++++++ .../api_endpoints/default_document_deleter.py | 7 +- .../api_endpoints/default_file_uploader.py | 35 ++---- .../api_endpoints/default_source_uploader.py | 41 +++---- .../managed_page_summary_enhancer.py | 0 .../models/http_validation_error.py | 1 + .../admin_api_lib/utils/thread_diagnostics.py | 0 .../tests/default_file_uploader_test.py | 100 +++++++++--------- .../tests/default_source_uploader_test.py | 28 ++--- .../tests/test_confluence_integration.py | 0 admin-api-lib/tests/test_thread_management.py | 0 extractor-api-lib/poetry.lock | 14 +-- extractor-api-lib/pyproject.toml | 2 +- .../apis/extractor_api_base.py | 2 +- .../api_endpoints/general_source_extractor.py | 4 +- rag-core-lib/poetry.lock | 14 +-- rag-core-lib/pyproject.toml | 1 + 23 files changed, 159 insertions(+), 139 deletions(-) create mode 100644 admin-api-lib/docs/thread_management.md create mode 100644 admin-api-lib/examples/thread_management_example.py 
create mode 100644 admin-api-lib/src/admin_api_lib/api_endpoints/uploader_base.py create mode 100644 admin-api-lib/src/admin_api_lib/impl/information_enhancer/managed_page_summary_enhancer.py create mode 100644 admin-api-lib/src/admin_api_lib/utils/thread_diagnostics.py create mode 100644 admin-api-lib/tests/test_confluence_integration.py create mode 100644 admin-api-lib/tests/test_thread_management.py diff --git a/admin-api-lib/docs/thread_management.md b/admin-api-lib/docs/thread_management.md new file mode 100644 index 0000000..e69de29 diff --git a/admin-api-lib/examples/thread_management_example.py b/admin-api-lib/examples/thread_management_example.py new file mode 100644 index 0000000..e69de29 diff --git a/admin-api-lib/pyproject.toml b/admin-api-lib/pyproject.toml index ec0de57..2668032 100644 --- a/admin-api-lib/pyproject.toml +++ b/admin-api-lib/pyproject.toml @@ -29,7 +29,7 @@ per-file-ignores = """ ./src/admin_api_lib/impl/admin_api.py: B008, ./src/admin_api_lib/dependency_container.py: CCE002,CCE001, ./src/admin_api_lib/apis/admin_api_base.py: WOT001, - ./tests/*: S101,S106,D100,D103,PT011 + ./tests/*: S101,S106,D100,D103,PT011,N802 ./src/admin_api_lib/impl/settings/confluence_settings.py: C901,N805, ./src/admin_api_lib/impl/utils/comma_separated_bool_list.py: R505, ./src/admin_api_lib/impl/utils/comma_separated_str_list.py: R505, diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/document_deleter.py b/admin-api-lib/src/admin_api_lib/api_endpoints/document_deleter.py index 155baf0..3f222bc 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/document_deleter.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/document_deleter.py @@ -7,7 +7,7 @@ class DocumentDeleter(ABC): """Abstract base class for document deletion endpoint.""" @abstractmethod - async def adelete_document(self, identification: str) -> None: + async def adelete_document(self, identification: str, remove_from_key_value_store: bool = True) -> None: """ Delete a document by 
its identification asynchronously. @@ -15,6 +15,8 @@ async def adelete_document(self, identification: str) -> None: ---------- identification : str The unique identifier of the document to be deleted. + remove_from_key_value_store : bool, optional + If True, the document will also be removed from the key-value store (default is True). Returns ------- diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py index d146a1b..f45636e 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -1,12 +1,14 @@ """Module for the upload file endpoint.""" -from abc import ABC, abstractmethod +from abc import abstractmethod from typing import Optional from fastapi import UploadFile +from admin_api_lib.api_endpoints.uploader_base import UploaderBase -class FileUploader(ABC): + +class FileUploader(UploaderBase): @abstractmethod async def upload_file( diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py index 95c9d6e..5a1c50a 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/source_uploader.py @@ -1,15 +1,16 @@ """Module for the upload source endpoint.""" -from abc import ABC, abstractmethod +from abc import abstractmethod from typing import Optional from pydantic import StrictStr +from admin_api_lib.api_endpoints.uploader_base import UploaderBase from admin_api_lib.models.key_value_pair import KeyValuePair -class SourceUploader(ABC): - """Abstract base class for source upload.""" +class SourceUploader(UploaderBase): + """Abstract base class for source uploader API endpoints.""" @abstractmethod async def upload_source( diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/uploader_base.py 
b/admin-api-lib/src/admin_api_lib/api_endpoints/uploader_base.py new file mode 100644 index 0000000..a344dcc --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/uploader_base.py @@ -0,0 +1,30 @@ +"""Module for the base class of uploader API endpoints.""" + +from threading import Thread + + +class UploaderBase: + """Base class for uploader API endpoints.""" + + def __init__(self): + """ + Initialize the UploaderBase. + """ + self._background_threads = [] + + def _prune_background_threads(self) -> list[Thread]: + """ + Prune background threads that are no longer running. + + Returns + ------- + list[Thread] + A list of background threads that are still alive. + """ + tmp_background_threads = [] + for thread in self._background_threads: + if not thread.is_alive(): + thread.join() + else: + tmp_background_threads.append(thread) + self._background_threads = tmp_background_threads diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py index 9f3c414..3cf671f 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py @@ -41,7 +41,7 @@ def __init__(self, file_service: FileService, rag_api: RagApi, key_value_store: self._rag_api = rag_api self._key_value_store = key_value_store - async def adelete_document(self, identification: str) -> None: + async def adelete_document(self, identification: str, remove_from_key_value_store: bool = True) -> None: """ Asynchronously delete a document identified by the given identification string. @@ -55,6 +55,8 @@ async def adelete_document(self, identification: str) -> None: ---------- identification : str The unique identifier of the document to be deleted. + remove_from_key_value_store : bool, optional + If True, the document will also be removed from the key-value store (default is True). 
Raises ------ @@ -66,7 +68,8 @@ async def adelete_document(self, identification: str) -> None: # Delete the document from file service and vector database logger.debug("Deleting existing document: %s", identification) try: - self._key_value_store.remove(identification) + if remove_from_key_value_store: + self._key_value_store.remove(identification) self._file_service.delete_file(identification) except Exception as e: error_messages += f"Error while deleting {identification} from file storage\n {str(e)}\n" diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 5a61b02..b558f11 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -1,4 +1,3 @@ -import asyncio import logging from pathlib import Path import traceback @@ -64,6 +63,7 @@ def __init__( file_service : FileService The service for handling file operations on the S3 storage """ + super().__init__() self._extractor_api = extractor_api self._rag_api = rag_api self._key_value_store = key_value_store @@ -94,16 +94,18 @@ async def upload_file( ------- None """ - self._background_threads = [t for t in self._background_threads if t.is_alive()] + self._prune_background_threads() try: - content = await file.read() file.filename = sanitize_document_name(file.filename) source_name = f"file:{sanitize_document_name(file.filename)}" self._check_if_already_in_processing(source_name) self._key_value_store.upsert(source_name, Status.PROCESSING) + content = await file.read() s3_path = await self._asave_new_document(content, file.filename, source_name) - thread = Thread(target=self._thread_worker, args=(s3_path, source_name, file.filename, base_url, timeout)) + thread = Thread( + target=lambda: run(self._handle_source_upload(s3_path, source_name, file.filename, base_url)) + ) #TODO: add timeout. 
same logic like in default_source_uploader leaded to strange behavior thread.start() self._background_threads.append(thread) except ValueError as e: @@ -136,25 +138,6 @@ def _check_if_already_in_processing(self, source_name: str) -> None: if any(s == Status.PROCESSING for s in existing): raise ValueError(f"Document {source_name} is already in processing state") - def _thread_worker(self,s3_path, source_name, filename, base_url, timeout): - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - try: - loop.run_until_complete( - asyncio.wait_for( - self._handle_source_upload(s3_path, source_name, filename, base_url), - timeout=timeout - ) - ) - except asyncio.TimeoutError: - logger.error("Upload of %s timed out after %s seconds", source_name, timeout) - self._key_value_store.upsert(source_name, Status.ERROR) - except Exception as e: - logger.exception("Error while uploading %s", source_name) - self._key_value_store.upsert(source_name, Status.ERROR) - finally: - loop.close() - async def _handle_source_upload( self, s3_path: Path, @@ -171,7 +154,9 @@ async def _handle_source_upload( self._key_value_store.upsert(source_name, Status.ERROR) logger.error("No information pieces found in the document: %s", source_name) raise Exception("No information pieces found") - documents = [self._information_mapper.extractor_information_piece2document(x) for x in information_pieces] + documents: list[Document] = [] + for piece in information_pieces: + documents.append(self._information_mapper.extractor_information_piece2document(piece)) chunked_documents = self._chunker.chunk(documents) @@ -184,7 +169,7 @@ async def _handle_source_upload( # Replace old document # deletion is allowed to fail with suppress(Exception): - await self._document_deleter.adelete_document(source_name) + await self._document_deleter.adelete_document(source_name, remove_from_key_value_store=False) self._rag_api.upload_information_piece(rag_information_pieces) self._key_value_store.upsert(source_name, 
Status.READY) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index c91fd75..71b09de 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -1,8 +1,6 @@ - -from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError import logging import asyncio -from threading import Thread, Event +from threading import Thread from contextlib import suppress from pydantic import StrictStr @@ -27,6 +25,7 @@ logger = logging.getLogger(__name__) + class DefaultSourceUploader(SourceUploader): def __init__( @@ -59,6 +58,7 @@ def __init__( information_mapper : InformationPiece2Document The mapper for converting information pieces to langchain documents. """ + super().__init__() self._extractor_api = extractor_api self._rag_api = rag_api self._key_value_store = key_value_store @@ -94,7 +94,7 @@ async def upload_source( None """ - self._background_threads = [t for t in self._background_threads if t.is_alive()] + self._prune_background_threads() source_name = f"{source_type}:{sanitize_document_name(name)}" try: @@ -106,16 +106,11 @@ async def upload_source( self._background_threads.append(thread) except ValueError as e: self._key_value_store.upsert(source_name, Status.ERROR) - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, detail=str(e) - ) + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) except Exception as e: self._key_value_store.upsert(source_name, Status.ERROR) logger.error("Error while uploading %s = %s", source_name, str(e)) - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e) - ) - + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) def _check_if_already_in_processing(self, source_name: str) -> None: """ 
@@ -139,21 +134,21 @@ def _check_if_already_in_processing(self, source_name: str) -> None: if any(s == Status.PROCESSING for s in existing): raise ValueError(f"Document {source_name} is already in processing state") - def _thread_worker(self,source_name, source_type, kwargs, timeout): + def _thread_worker(self, source_name, source_type, kwargs, timeout): loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) try: loop.run_until_complete( asyncio.wait_for( self._handle_source_upload(source_name=source_name, source_type=source_type, kwargs=kwargs), - timeout=timeout + timeout=timeout, ) ) except asyncio.TimeoutError: logger.error("Upload of %s timed out after %s seconds", source_name, timeout) self._key_value_store.upsert(source_name, Status.ERROR) - except Exception as e: - logger.exception("Error while uploading %s", source_name) + except Exception: + logger.error("Error while uploading %s", source_name) self._key_value_store.upsert(source_name, Status.ERROR) finally: loop.close() @@ -167,9 +162,7 @@ async def _handle_source_upload( try: information_pieces = self._extractor_api.extract_from_source( ExtractionParameters( - source_type=source_type, - document_name=source_name, - kwargs=[x.to_dict() for x in kwargs] + source_type=source_type, document_name=source_name, kwargs=[x.to_dict() for x in kwargs] ) ) @@ -183,19 +176,19 @@ async def _handle_source_upload( chunked_documents = self._chunker.chunk(documents) - enhanced_documents = await self._information_enhancer.ainvoke(chunked_documents) + # limit concurrency to avoid spawning multiple threads per call + enhanced_documents = await self._information_enhancer.ainvoke( + chunked_documents, config={"max_concurrency": 1} + ) rag_information_pieces: list[RagInformationPiece] = [] for doc in enhanced_documents: - rag_information_pieces.append( - self._information_mapper.document2rag_information_piece(doc) - ) + rag_information_pieces.append(self._information_mapper.document2rag_information_piece(doc)) with 
suppress(Exception): - await self._document_deleter.adelete_document(source_name) + await self._document_deleter.adelete_document(source_name, remove_from_key_value_store=False) self._rag_api.upload_information_piece(rag_information_pieces) - self._key_value_store.upsert(source_name, Status.READY) logger.info("Source uploaded successfully: %s", source_name) except Exception as e: diff --git a/admin-api-lib/src/admin_api_lib/impl/information_enhancer/managed_page_summary_enhancer.py b/admin-api-lib/src/admin_api_lib/impl/information_enhancer/managed_page_summary_enhancer.py new file mode 100644 index 0000000..e69de29 diff --git a/admin-api-lib/src/admin_api_lib/models/http_validation_error.py b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py index 28c83f0..7d5feeb 100644 --- a/admin-api-lib/src/admin_api_lib/models/http_validation_error.py +++ b/admin-api-lib/src/admin_api_lib/models/http_validation_error.py @@ -49,6 +49,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" return self.model_dump_json(by_alias=True, exclude_unset=True) + @classmethod def from_json(cls, json_str: str) -> Self: """Create an instance of HTTPValidationError from a JSON string""" diff --git a/admin-api-lib/src/admin_api_lib/utils/thread_diagnostics.py b/admin-api-lib/src/admin_api_lib/utils/thread_diagnostics.py new file mode 100644 index 0000000..e69de29 diff --git a/admin-api-lib/tests/default_file_uploader_test.py b/admin-api-lib/tests/default_file_uploader_test.py index 8cceb14..19318e9 100644 --- a/admin-api-lib/tests/default_file_uploader_test.py +++ b/admin-api-lib/tests/default_file_uploader_test.py @@ -3,14 +3,14 @@ from unittest.mock import AsyncMock, MagicMock from fastapi import HTTPException from fastapi import UploadFile -import threading, time from admin_api_lib.impl.api_endpoints.default_file_uploader import DefaultFileUploader from admin_api_lib.models.status import Status from 
admin_api_lib.utils.utils import sanitize_document_name from admin_api_lib.impl.api_endpoints import default_file_uploader -@ pytest.fixture + +@pytest.fixture def mocks(): extractor_api = MagicMock() key_value_store = MagicMock() @@ -24,7 +24,8 @@ def mocks(): information_mapper = MagicMock() return extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper -@ pytest.mark.asyncio + +@pytest.mark.asyncio async def test_handle_file_upload_success(mocks): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks # setup mocks @@ -38,24 +39,39 @@ async def test_handle_file_upload_success(mocks): information_mapper.document2rag_information_piece.return_value = dummy_rag uploader = DefaultFileUploader( - extractor_api, key_value_store, information_enhancer, chunker, - document_deleter, rag_api, information_mapper, file_service=MagicMock() + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + file_service=MagicMock(), ) - await uploader._handle_source_upload("s3path", "file:doc1", "doc1.txt", "http://base") + upload_filename = "file:doc1" + + await uploader._handle_source_upload("s3path", upload_filename, "doc1.txt", "http://base") - key_value_store.upsert.assert_any_call("file:doc1", Status.READY) + key_value_store.upsert.assert_any_call(upload_filename, Status.READY) rag_api.upload_information_piece.assert_called_once_with([dummy_rag]) - document_deleter.adelete_document.assert_awaited_once_with("file:doc1") + document_deleter.adelete_document.assert_awaited_once_with(upload_filename, remove_from_key_value_store=False) -@ pytest.mark.asyncio + +@pytest.mark.asyncio async def test_handle_file_upload_no_info_pieces(mocks): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks extractor_api.extract_from_file_post.return_value = [] 
uploader = DefaultFileUploader( - extractor_api, key_value_store, information_enhancer, chunker, - document_deleter, rag_api, information_mapper, file_service=MagicMock() + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + file_service=MagicMock(), ) filename = "file:doc2" await uploader._handle_source_upload("s3path", filename, "doc2.txt", "http://base") @@ -64,7 +80,8 @@ async def test_handle_file_upload_no_info_pieces(mocks): information_mapper.extractor_information_piece2document.assert_not_called() rag_api.upload_information_piece.assert_not_called() -@ pytest.mark.asyncio + +@pytest.mark.asyncio async def test_upload_file_already_processing_raises_error(mocks): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks base_url = "http://base" @@ -75,15 +92,22 @@ async def test_upload_file_already_processing_raises_error(mocks): key_value_store.get_all.return_value = [(source_name, Status.PROCESSING)] uploader = DefaultFileUploader( - extractor_api, key_value_store, information_enhancer, chunker, - document_deleter, rag_api, information_mapper, file_service=MagicMock() + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + file_service=MagicMock(), ) with pytest.raises(HTTPException): await uploader.upload_file(base_url, file) key_value_store.upsert.assert_any_call(source_name, Status.ERROR) -@ pytest.mark.asyncio + +@pytest.mark.asyncio async def test_upload_file_starts_thread(mocks, monkeypatch): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks base_url = "http://base" @@ -94,11 +118,17 @@ async def test_upload_file_starts_thread(mocks, monkeypatch): source_name = f"file:{sanitize_document_name(file.filename)}" dummy_thread = MagicMock() - monkeypatch.setattr(default_file_uploader, 'Thread', 
lambda *args, **kwargs: dummy_thread) + monkeypatch.setattr(default_file_uploader, "Thread", lambda *args, **kwargs: dummy_thread) uploader = DefaultFileUploader( - extractor_api, key_value_store, information_enhancer, chunker, - document_deleter, rag_api, information_mapper, file_service=MagicMock() + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + file_service=MagicMock(), ) await uploader.upload_file(base_url, file) @@ -106,37 +136,3 @@ async def test_upload_file_starts_thread(mocks, monkeypatch): key_value_store.upsert.assert_any_call(source_name, Status.PROCESSING) dummy_thread.start.assert_called_once() -@ pytest.mark.asyncio -async def test_upload_file_timeout_error(mocks, monkeypatch): - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks - base_url = "http://base" - file = MagicMock(spec=UploadFile) - file.filename = "slow.txt" - file.read = AsyncMock(return_value=b"") - key_value_store.get_all.return_value = [] - source_name = f"file:{sanitize_document_name(file.filename)}" - - # fast fake handler that sleeps long - async def fake_handle(self, s3_path, source_name_arg, filename, base_url_arg): - await asyncio.sleep(3600) - monkeypatch.setattr( - default_file_uploader.DefaultFileUploader, - '_handle_source_upload', - fake_handle - ) - def FakeThread(target, args=(), **kwargs): - class T: - def start(self_inner): target(*args) - def is_alive(self_inner): return False - return T() - monkeypatch.setattr(default_file_uploader, 'Thread', FakeThread) - - uploader = DefaultFileUploader( - extractor_api, key_value_store, information_enhancer, chunker, - document_deleter, rag_api, information_mapper, file_service=MagicMock() - ) - - await uploader.upload_file(base_url, file, timeout=0.1) - calls = [c.args for c in key_value_store.upsert.call_args_list] - assert (source_name, Status.PROCESSING) in calls - assert (source_name, 
Status.ERROR) in calls diff --git a/admin-api-lib/tests/default_source_uploader_test.py b/admin-api-lib/tests/default_source_uploader_test.py index 9210a0c..9c47416 100644 --- a/admin-api-lib/tests/default_source_uploader_test.py +++ b/admin-api-lib/tests/default_source_uploader_test.py @@ -1,14 +1,16 @@ +# ignore: + import asyncio import pytest from unittest.mock import AsyncMock, MagicMock from fastapi import HTTPException -import threading, time from admin_api_lib.impl.api_endpoints.default_source_uploader import DefaultSourceUploader from admin_api_lib.models.status import Status from admin_api_lib.utils.utils import sanitize_document_name from admin_api_lib.impl.api_endpoints import default_source_uploader + @pytest.fixture def mocks(): extractor_api = MagicMock() @@ -51,7 +53,7 @@ async def test_handle_source_upload_success(mocks): key_value_store.upsert.assert_any_call("source1", Status.READY) rag_api.upload_information_piece.assert_called_once_with([dummy_rag_piece]) - document_deleter.adelete_document.assert_awaited_once_with("source1") + document_deleter.adelete_document.assert_awaited_once_with("source1", remove_from_key_value_store=False) @pytest.mark.asyncio @@ -90,16 +92,16 @@ async def test_upload_source_already_processing_raises_error(mocks): await uploader.upload_source(source_type, name, []) key_value_store.upsert.assert_any_call(source_name, Status.ERROR) + @pytest.mark.asyncio async def test_upload_source_no_timeout(mocks, monkeypatch): extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks key_value_store.get_all.return_value = [] source_type = "typeZ" name = "quick" - source_name = f"{source_type}:{sanitize_document_name(name)}" # patch Thread so no actual background work is done dummy_thread = MagicMock() - monkeypatch.setattr(default_source_uploader, 'Thread', lambda *args, **kwargs: dummy_thread) + monkeypatch.setattr(default_source_uploader, "Thread", lambda *args, **kwargs: 
dummy_thread) uploader = DefaultSourceUploader( extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper ) @@ -118,24 +120,26 @@ async def test_upload_source_timeout_error(mocks, monkeypatch): source_type = "typeTimeout" name = "slow" source_name = f"{source_type}:{sanitize_document_name(name)}" + # monkey-patch the handler to sleep so that timeout triggers async def fake_handle(self, source_name_arg, source_type_arg, kwargs_arg): await asyncio.sleep(3600) + # patch handler and Thread to trigger timeout synchronously - monkeypatch.setattr( - default_source_uploader.DefaultSourceUploader, - '_handle_source_upload', - fake_handle - ) + monkeypatch.setattr(default_source_uploader.DefaultSourceUploader, "_handle_source_upload", fake_handle) + def FakeThread(target, args=(), **kwargs): # this ensures serial execution, so that the error status can be checked class T: - def start(self_inner): + def start(self): target(*args) - def is_alive(self_inner): + + def is_alive(self): return False + return T() - monkeypatch.setattr(default_source_uploader, 'Thread', FakeThread) + + monkeypatch.setattr(default_source_uploader, "Thread", FakeThread) uploader = DefaultSourceUploader( extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper ) diff --git a/admin-api-lib/tests/test_confluence_integration.py b/admin-api-lib/tests/test_confluence_integration.py new file mode 100644 index 0000000..e69de29 diff --git a/admin-api-lib/tests/test_thread_management.py b/admin-api-lib/tests/test_thread_management.py new file mode 100644 index 0000000..e69de29 diff --git a/extractor-api-lib/poetry.lock b/extractor-api-lib/poetry.lock index 0da6009..c750e96 100644 --- a/extractor-api-lib/poetry.lock +++ b/extractor-api-lib/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. 
+# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. [[package]] name = "aiofiles" @@ -1933,21 +1933,21 @@ tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10" [[package]] name = "langchain-core" -version = "0.3.58" +version = "0.3.63" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "langchain_core-0.3.58-py3-none-any.whl", hash = "sha256:266f90d2a079fe9510190ad3be88bd993baad43e6cee0f822a883767a4bfdd5b"}, - {file = "langchain_core-0.3.58.tar.gz", hash = "sha256:6ee2282b02fa65bf4ee1afa869d431505536757ff2f1f9f0b432d8ca755d66c6"}, + {file = "langchain_core-0.3.63-py3-none-any.whl", hash = "sha256:f91db8221b1bc6808f70b2e72fded1a94d50ee3f1dff1636fb5a5a514c64b7f5"}, + {file = "langchain_core-0.3.63.tar.gz", hash = "sha256:e2e30cfbb7684a5a0319f6cbf065fc3c438bfd1060302f085a122527890fb01e"}, ] [package.dependencies] jsonpatch = ">=1.33,<2.0" -langsmith = ">=0.1.125,<0.4" +langsmith = ">=0.1.126,<0.4" packaging = ">=23.2,<25" -pydantic = {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""} +pydantic = ">=2.7.4" PyYAML = ">=5.3" tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10.0.0" typing-extensions = ">=4.7" @@ -4877,4 +4877,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.13" -content-hash = "9dd34ca058d74aea96a5ebfc2d712ec2a36521b310858dcb5e5569bb2dd16333" +content-hash = "a25945d5914b2ad6c32bcd50f8b787c00e41df7e09fdb3c991f48cb9e9c15c72" diff --git a/extractor-api-lib/pyproject.toml b/extractor-api-lib/pyproject.toml index 4d6ac63..a648858 100644 --- a/extractor-api-lib/pyproject.toml +++ b/extractor-api-lib/pyproject.toml @@ -92,7 +92,7 @@ html5lib = "^1.1" langchain-community = "^0.3.23" atlassian-python-api = "^4.0.3" markdownify = "^1.1.0" -langchain-core = "^0.3.58" +langchain-core = "0.3.63" [tool.poetry.group.dev.dependencies] pytest = "^8.3.5" diff --git 
a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py index acb6022..800c214 100644 --- a/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py +++ b/extractor-api-lib/src/extractor_api_lib/apis/extractor_api_base.py @@ -29,7 +29,7 @@ async def extract_from_file_post( self, extraction_request: ExtractionRequest, ) -> List[InformationPiece]: - """ + """ Extract information from a file based on the provided extraction request. Parameters diff --git a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py index 8e08ad6..10d8cd5 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py @@ -51,7 +51,9 @@ async def aextract_information( list[InformationPiece] A list of extracted information pieces. """ - correct_extractors = [x for x in self._available_extractors if extraction_parameters.source_type == x.extractor_type] + correct_extractors = [ + x for x in self._available_extractors if extraction_parameters.source_type == x.extractor_type + ] if not correct_extractors: raise ValueError(f"No extractor found for type {extraction_parameters.source_type}") results = await correct_extractors[-1].aextract_content(extraction_parameters) diff --git a/rag-core-lib/poetry.lock b/rag-core-lib/poetry.lock index 90b3fb9..4487b8e 100644 --- a/rag-core-lib/poetry.lock +++ b/rag-core-lib/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -1623,21 +1623,21 @@ tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10" [[package]] name = "langchain-core" -version = "0.3.58" +version = "0.3.63" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "langchain_core-0.3.58-py3-none-any.whl", hash = "sha256:266f90d2a079fe9510190ad3be88bd993baad43e6cee0f822a883767a4bfdd5b"}, - {file = "langchain_core-0.3.58.tar.gz", hash = "sha256:6ee2282b02fa65bf4ee1afa869d431505536757ff2f1f9f0b432d8ca755d66c6"}, + {file = "langchain_core-0.3.63-py3-none-any.whl", hash = "sha256:f91db8221b1bc6808f70b2e72fded1a94d50ee3f1dff1636fb5a5a514c64b7f5"}, + {file = "langchain_core-0.3.63.tar.gz", hash = "sha256:e2e30cfbb7684a5a0319f6cbf065fc3c438bfd1060302f085a122527890fb01e"}, ] [package.dependencies] jsonpatch = ">=1.33,<2.0" -langsmith = ">=0.1.125,<0.4" +langsmith = ">=0.1.126,<0.4" packaging = ">=23.2,<25" -pydantic = {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""} +pydantic = ">=2.7.4" PyYAML = ">=5.3" tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10.0.0" typing-extensions = ">=4.7" @@ -3384,4 +3384,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.13" -content-hash = "2aa5df2f5304dfb56d7adfeeb4f8817ecf9d7eaaadc5af9127875a5aa442c7d0" +content-hash = "265d9eb8b910f4831f5e5e7e78a0e9b3b010793fed03d30a96393a2f8c1792db" diff --git a/rag-core-lib/pyproject.toml b/rag-core-lib/pyproject.toml index c63b316..2ca85e3 100644 --- a/rag-core-lib/pyproject.toml +++ b/rag-core-lib/pyproject.toml @@ -21,6 +21,7 @@ requests-oauthlib = "^2.0.0" langfuse = "^2.60.4" deprecated = "^1.2.18" openai = "^1.77.0" +langchain-core = "0.3.63" [tool.poetry.group.dev.dependencies] From 7f1df26ed0db4b8f0fe439b22be86a984bb5853f Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 07:40:18 +0200 Subject: [PATCH 30/56] refactor: add TODO for implementing timeout in thread 
handling for file uploads --- .../admin_api_lib/impl/api_endpoints/default_file_uploader.py | 2 +- admin-api-lib/tests/default_file_uploader_test.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index b558f11..2c6f868 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -105,7 +105,7 @@ async def upload_file( s3_path = await self._asave_new_document(content, file.filename, source_name) thread = Thread( target=lambda: run(self._handle_source_upload(s3_path, source_name, file.filename, base_url)) - ) #TODO: add timeout. same logic like in default_source_uploader leaded to strange behavior + ) # TODO: add timeout. same logic like in default_source_uploader leaded to strange behavior thread.start() self._background_threads.append(thread) except ValueError as e: diff --git a/admin-api-lib/tests/default_file_uploader_test.py b/admin-api-lib/tests/default_file_uploader_test.py index 19318e9..e76b9b5 100644 --- a/admin-api-lib/tests/default_file_uploader_test.py +++ b/admin-api-lib/tests/default_file_uploader_test.py @@ -135,4 +135,3 @@ async def test_upload_file_starts_thread(mocks, monkeypatch): key_value_store.upsert.assert_any_call(source_name, Status.PROCESSING) dummy_thread.start.assert_called_once() - From 8a6d4f16f4b624ac3d816e74470aef4a25791cec Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 07:40:46 +0200 Subject: [PATCH 31/56] refactor: remove unused asyncio import from default_file_uploader_test.py --- admin-api-lib/tests/default_file_uploader_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/admin-api-lib/tests/default_file_uploader_test.py b/admin-api-lib/tests/default_file_uploader_test.py index e76b9b5..079a935 100644 --- 
a/admin-api-lib/tests/default_file_uploader_test.py +++ b/admin-api-lib/tests/default_file_uploader_test.py @@ -1,4 +1,3 @@ -import asyncio import pytest from unittest.mock import AsyncMock, MagicMock from fastapi import HTTPException From 5af5c76717d4b9bbdad9939349798b0b47f22b2e Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 07:44:34 +0200 Subject: [PATCH 32/56] refactor: remove unused thread management documentation and example files --- admin-api-lib/docs/thread_management.md | 0 admin-api-lib/examples/thread_management_example.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 admin-api-lib/docs/thread_management.md delete mode 100644 admin-api-lib/examples/thread_management_example.py diff --git a/admin-api-lib/docs/thread_management.md b/admin-api-lib/docs/thread_management.md deleted file mode 100644 index e69de29..0000000 diff --git a/admin-api-lib/examples/thread_management_example.py b/admin-api-lib/examples/thread_management_example.py deleted file mode 100644 index e69de29..0000000 From fa2f9282e3403e1de5a5729056765a0c598a71a5 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 08:12:17 +0200 Subject: [PATCH 33/56] chore: update poetry.lock and pyproject.toml for dependency version changes and configuration adjustments --- admin-api-lib/tests/test_default_source_uploader.py | 0 rag-core-api/poetry.lock | 13 +++++++------ rag-core-api/pyproject.toml | 6 +++--- 3 files changed, 10 insertions(+), 9 deletions(-) delete mode 100644 admin-api-lib/tests/test_default_source_uploader.py diff --git a/admin-api-lib/tests/test_default_source_uploader.py b/admin-api-lib/tests/test_default_source_uploader.py deleted file mode 100644 index e69de29..0000000 diff --git a/rag-core-api/poetry.lock b/rag-core-api/poetry.lock index e5ea53f..9812609 100644 --- a/rag-core-api/poetry.lock +++ b/rag-core-api/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by 
hand. +# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -1959,21 +1959,21 @@ tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10" [[package]] name = "langchain-core" -version = "0.3.58" +version = "0.3.63" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "langchain_core-0.3.58-py3-none-any.whl", hash = "sha256:266f90d2a079fe9510190ad3be88bd993baad43e6cee0f822a883767a4bfdd5b"}, - {file = "langchain_core-0.3.58.tar.gz", hash = "sha256:6ee2282b02fa65bf4ee1afa869d431505536757ff2f1f9f0b432d8ca755d66c6"}, + {file = "langchain_core-0.3.63-py3-none-any.whl", hash = "sha256:f91db8221b1bc6808f70b2e72fded1a94d50ee3f1dff1636fb5a5a514c64b7f5"}, + {file = "langchain_core-0.3.63.tar.gz", hash = "sha256:e2e30cfbb7684a5a0319f6cbf065fc3c438bfd1060302f085a122527890fb01e"}, ] [package.dependencies] jsonpatch = ">=1.33,<2.0" -langsmith = ">=0.1.125,<0.4" +langsmith = ">=0.1.126,<0.4" packaging = ">=23.2,<25" -pydantic = {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""} +pydantic = ">=2.7.4" PyYAML = ">=5.3" tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10.0.0" typing-extensions = ">=4.7" @@ -3843,6 +3843,7 @@ deprecated = "^1.2.18" flashrank = "^0.2.10" langchain = "^0.3.25" langchain-community = "0.3.23" +langchain-core = "0.3.63" langfuse = "^2.60.4" oauthlib = "^3.2.2" openai = "^1.77.0" diff --git a/rag-core-api/pyproject.toml b/rag-core-api/pyproject.toml index 4fd633c..2194a90 100644 --- a/rag-core-api/pyproject.toml +++ b/rag-core-api/pyproject.toml @@ -118,8 +118,8 @@ known_local_folder = ["rag_core_api", "rag_core_lib"] max-line-length = 120 [tool.pytest.ini_options] -log_cli = 1 +log_cli = true log_cli_level = "DEBUG" -pythonpath = "src" -testpaths = "src/tests" +pythonpath = ["src", "tests"] +testpaths = "tests" From 8dc79900949b7997ecba92442052712a60c3dd4f Mon Sep 17 00:00:00 2001 
From: Andreas Klos Date: Mon, 2 Jun 2025 08:16:19 +0200 Subject: [PATCH 34/56] chore: Update README.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7becbcd..3d3edf6 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ The extracted information will be summarized using a LLM. The summary, as well a #### `/upload_source` -Loads all the content from an abritrary non-file source using the [document-extractor](#3-extractor-api-lib). +Loads all the content from an arbitrary non-file source using the [document-extractor](#3-extractor-api-lib). The `type`of the source needs to correspond to an extractor in the [document-extractor](#3-extractor-api-lib). The extracted information will be summarized using LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api). From 57788eb4829997f05e74ce447cf908619aa142a4 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 08:21:52 +0200 Subject: [PATCH 35/56] fix: correct spelling of 'arbitrary' in README and update query parameter alias in upload_source function --- README.md | 2 +- admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py | 3 --- admin-api-lib/src/admin_api_lib/apis/admin_api.py | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7becbcd..3d3edf6 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ The extracted information will be summarized using a LLM. The summary, as well a #### `/upload_source` -Loads all the content from an abritrary non-file source using the [document-extractor](#3-extractor-api-lib). +Loads all the content from an arbitrary non-file source using the [document-extractor](#3-extractor-api-lib). The `type`of the source needs to correspond to an extractor in the [document-extractor](#3-extractor-api-lib). 
The extracted information will be summarized using LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api). diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py index f45636e..2260bd4 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -15,7 +15,6 @@ async def upload_file( self, base_url: str, file: UploadFile, - timeout: Optional[float], ) -> None: """ Uploads a source file for content extraction. @@ -26,8 +25,6 @@ async def upload_file( The base url of the service. Is used to determine the download link of the file. file : UploadFile The file to process. - timeout : float, optional - Timeout for the operation. Returns ------- diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index a323bd6..d67a246 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -176,7 +176,7 @@ async def upload_file( response_model_by_alias=True, ) async def upload_source( - source_type: StrictStr = Query(None, description="", alias="type"), + source_type: StrictStr = Query(None, description="", alias="sourceType"), name: StrictStr = Query(None, description="", alias="name"), key_value_pair: List[KeyValuePair] = Body(None, description=""), ) -> None: From d942cf71a641838262630198a8364201574e5fba Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 08:24:25 +0200 Subject: [PATCH 36/56] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3d3edf6..f246276 100644 --- a/README.md +++ b/README.md @@ -210,7 +210,7 @@ The type of information that is extracted will vary depending on the source, the - `TEXT`: plain text - `TABLE`: data in 
tabular form found in the document -- `IMAGE`: data in tabular form found in the document +- `IMAGE`: image found in the document ### 3.3 Replaceable parts From 21d10d9246a6cde6c40e43ca29b1a391b7c7de51 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 08:27:45 +0200 Subject: [PATCH 37/56] refactor: remove unused import and enhance query parameter descriptions in upload_source function --- .../src/admin_api_lib/api_endpoints/file_uploader.py | 1 - admin-api-lib/src/admin_api_lib/apis/admin_api.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py index 2260bd4..3ab7464 100644 --- a/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/api_endpoints/file_uploader.py @@ -1,7 +1,6 @@ """Module for the upload file endpoint.""" from abc import abstractmethod -from typing import Optional from fastapi import UploadFile diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index d67a246..a8979af 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -176,9 +176,9 @@ async def upload_file( response_model_by_alias=True, ) async def upload_source( - source_type: StrictStr = Query(None, description="", alias="sourceType"), - name: StrictStr = Query(None, description="", alias="name"), - key_value_pair: List[KeyValuePair] = Body(None, description=""), + source_type: StrictStr = Query(None, description="The type of the source", alias="sourceType"), + name: StrictStr = Query(None, description="The name of the source", alias="name"), + key_value_pair: List[KeyValuePair] = Body(None, description="The key-value pairs for the source"), ) -> None: """ Uploads user selected sources. 
From bc503f24917f0c95fe36ac3f428488544b3f0269 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 08:31:29 +0200 Subject: [PATCH 38/56] refactor: remove timeout parameter from DefaultFileUploader and delete unused managed_page_summary_enhancer module --- .../admin_api_lib/impl/api_endpoints/default_file_uploader.py | 1 - .../impl/information_enhancer/managed_page_summary_enhancer.py | 0 2 files changed, 1 deletion(-) delete mode 100644 admin-api-lib/src/admin_api_lib/impl/information_enhancer/managed_page_summary_enhancer.py diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 2c6f868..5217501 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -78,7 +78,6 @@ async def upload_file( self, base_url: str, file: UploadFile, - timeout: Optional[float] = 3600.0, ) -> None: """ Uploads a source file for content extraction. 
diff --git a/admin-api-lib/src/admin_api_lib/impl/information_enhancer/managed_page_summary_enhancer.py b/admin-api-lib/src/admin_api_lib/impl/information_enhancer/managed_page_summary_enhancer.py deleted file mode 100644 index e69de29..0000000 From a5523fbb0ecfadc9baa362ff80007a6c57c005b0 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 08:33:07 +0200 Subject: [PATCH 39/56] refactor: remove unused thread_diagnostics.py file --- admin-api-lib/src/admin_api_lib/utils/thread_diagnostics.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 admin-api-lib/src/admin_api_lib/utils/thread_diagnostics.py diff --git a/admin-api-lib/src/admin_api_lib/utils/thread_diagnostics.py b/admin-api-lib/src/admin_api_lib/utils/thread_diagnostics.py deleted file mode 100644 index e69de29..0000000 From 5dafd3e20bb25464b203d967cbbd39da1a650e28 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 10:03:37 +0200 Subject: [PATCH 40/56] feat: add SourceUploaderSettings for configurable timeout and refactor DefaultSourceUploader to use it refactor: update JSON serialization in ExtractionParameters, ExtractionRequest, InformationPiece, and KeyValuePair models refactor: remove unused test files for confluence and thread management integration --- .../api_endpoints/default_source_uploader.py | 6 ++-- .../impl/settings/source_uploader_settings.py | 23 ++++++++++++++ .../tests/test_confluence_integration.py | 0 admin-api-lib/tests/test_thread_management.py | 0 .../impl/extractor_api_impl.py | 30 +++++++++++++++++++ .../models/extraction_parameters.py | 3 +- .../models/extraction_request.py | 3 +- .../models/information_piece.py | 3 +- .../models/key_value_pair.py | 3 +- 9 files changed, 61 insertions(+), 10 deletions(-) create mode 100644 admin-api-lib/src/admin_api_lib/impl/settings/source_uploader_settings.py delete mode 100644 admin-api-lib/tests/test_confluence_integration.py delete mode 100644 admin-api-lib/tests/test_thread_management.py diff 
--git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py index 71b09de..bc891b7 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py @@ -9,6 +9,7 @@ from admin_api_lib.extractor_api_client.openapi_client.api.extractor_api import ExtractorApi from admin_api_lib.extractor_api_client.openapi_client.models.extraction_parameters import ExtractionParameters +from admin_api_lib.impl.settings.source_uploader_settings import SourceUploaderSettings from admin_api_lib.models.key_value_pair import KeyValuePair from admin_api_lib.rag_backend_client.openapi_client.api.rag_api import RagApi from admin_api_lib.impl.mapper.informationpiece2document import InformationPiece2Document @@ -37,6 +38,7 @@ def __init__( document_deleter: DocumentDeleter, rag_api: RagApi, information_mapper: InformationPiece2Document, + settings: SourceUploaderSettings, ): """ Initialize the DefaultSourceUploader. @@ -67,13 +69,13 @@ def __init__( self._chunker = chunker self._document_deleter = document_deleter self._background_threads = [] + self._settings = settings async def upload_source( self, source_type: StrictStr, name: StrictStr, kwargs: list[KeyValuePair], - timeout: float = 3600.0, ) -> None: """ Uploads the parameters for source content extraction. 
@@ -101,7 +103,7 @@ async def upload_source( self._check_if_already_in_processing(source_name) self._key_value_store.upsert(source_name, Status.PROCESSING) - thread = Thread(target=self._thread_worker, args=(source_name, source_type, kwargs, timeout)) + thread = Thread(target=self._thread_worker, args=(source_name, source_type, kwargs, self._settings.timeout)) thread.start() self._background_threads.append(thread) except ValueError as e: diff --git a/admin-api-lib/src/admin_api_lib/impl/settings/source_uploader_settings.py b/admin-api-lib/src/admin_api_lib/impl/settings/source_uploader_settings.py new file mode 100644 index 0000000..70f18bd --- /dev/null +++ b/admin-api-lib/src/admin_api_lib/impl/settings/source_uploader_settings.py @@ -0,0 +1,23 @@ +"""Contains settings regarding the SourceUploader.""" + +from pydantic import Field +from pydantic_settings import BaseSettings + + +class SourceUploaderSettings(BaseSettings): + """ + Contains settings regarding the SourceUploader. + + Attributes + ---------- + timeout : float + The timeout for the SourceUploader. 
+ """ + + class Config: + """Config class for reading Fields from env.""" + + env_prefix = "SOURCE_UPLOADER_" + case_sensitive = False + + timeout: float = Field(default=3600.0, description="Timeout for the SourceUploader in seconds.") diff --git a/admin-api-lib/tests/test_confluence_integration.py b/admin-api-lib/tests/test_confluence_integration.py deleted file mode 100644 index e69de29..0000000 diff --git a/admin-api-lib/tests/test_thread_management.py b/admin-api-lib/tests/test_thread_management.py deleted file mode 100644 index e69de29..0000000 diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py index 276f720..b1aa8c1 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractor_api_impl.py @@ -21,6 +21,21 @@ async def extract_from_file_post( extraction_request: ExtractionRequest, extractor: FileExtractor = Depends(Provide[DependencyContainer.general_file_extractor]), ) -> list[InformationPiece]: + """ + Extract information from a file based on the provided extraction request. + + Parameters + ---------- + extraction_request : ExtractionRequest + The request containing details about the extraction process. + extractor : FileExtractor, optional + The file extractor dependency. + + Returns + ------- + list[InformationPiece] + A list of extracted information pieces. + """ return await extractor.aextract_information(extraction_request) async def extract_from_source( @@ -28,4 +43,19 @@ async def extract_from_source( extraction_parameters: ExtractionParameters, extractor: SourceExtractor = Depends(Provide[DependencyContainer.source_extractor]), ) -> list[InformationPiece]: + """ + Extract information from a source (e.g. confluence) asynchronously. + + Parameters + ---------- + extraction_parameters : ExtractionParameters + Parameters required to extract information from source. 
+ extractor : SourceExtractor, optional + The source extractor instance. + + Returns + ------- + list[InformationPiece] + A list of extracted information pieces. + """ return await extractor.aextract_information(extraction_parameters) diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py index e18a452..e903b4e 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_parameters.py @@ -50,8 +50,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py index 769b658..3befa42 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py +++ b/extractor-api-lib/src/extractor_api_lib/models/extraction_request.py @@ -46,8 +46,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/extractor-api-lib/src/extractor_api_lib/models/information_piece.py b/extractor-api-lib/src/extractor_api_lib/models/information_piece.py index 8890a13..3ffb308 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/models/information_piece.py @@ 
-51,8 +51,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: diff --git a/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py b/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py index f751313..3cba505 100644 --- a/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py +++ b/extractor-api-lib/src/extractor_api_lib/models/key_value_pair.py @@ -46,8 +46,7 @@ def to_str(self) -> str: def to_json(self) -> str: """Returns the JSON representation of the model using alias""" - # TODO: pydantic v2: use .model_dump_json(by_alias=True, exclude_unset=True) instead - return json.dumps(self.to_dict()) + return self.model_dump_json(by_alias=True, exclude_unset=True) @classmethod def from_json(cls, json_str: str) -> Self: From 4ab029b00dc180a43b6cb4f27a6ac4ea447b07b8 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 10:43:15 +0200 Subject: [PATCH 41/56] refactor: remove unused import of Optional in default_file_uploader.py --- .../admin_api_lib/impl/api_endpoints/default_file_uploader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py index 5217501..fa4a27a 100644 --- a/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +++ b/admin-api-lib/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py @@ -2,7 +2,6 @@ from pathlib import Path import traceback from threading import Thread -from typing import Optional import urllib import tempfile from contextlib import suppress From 7f53875890aa06353b99933aca317bb553dd821b Mon Sep 17 00:00:00 2001 
From: Andreas Klos Date: Mon, 2 Jun 2025 11:04:46 +0200 Subject: [PATCH 42/56] feat: add SourceUploaderSettings to DependencyContainer and update upload_source function --- admin-api-lib/src/admin_api_lib/apis/admin_api.py | 2 +- admin-api-lib/src/admin_api_lib/dependency_container.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/admin-api-lib/src/admin_api_lib/apis/admin_api.py b/admin-api-lib/src/admin_api_lib/apis/admin_api.py index a8979af..c348b5d 100644 --- a/admin-api-lib/src/admin_api_lib/apis/admin_api.py +++ b/admin-api-lib/src/admin_api_lib/apis/admin_api.py @@ -176,7 +176,7 @@ async def upload_file( response_model_by_alias=True, ) async def upload_source( - source_type: StrictStr = Query(None, description="The type of the source", alias="sourceType"), + source_type: StrictStr = Query(None, description="The type of the source"), name: StrictStr = Query(None, description="The name of the source", alias="name"), key_value_pair: List[KeyValuePair] = Body(None, description="The key-value pairs for the source"), ) -> None: diff --git a/admin-api-lib/src/admin_api_lib/dependency_container.py b/admin-api-lib/src/admin_api_lib/dependency_container.py index 640ea72..fd5e0a1 100644 --- a/admin-api-lib/src/admin_api_lib/dependency_container.py +++ b/admin-api-lib/src/admin_api_lib/dependency_container.py @@ -49,6 +49,7 @@ from admin_api_lib.impl.settings.key_value_settings import KeyValueSettings from admin_api_lib.impl.settings.rag_api_settings import RAGAPISettings from admin_api_lib.impl.settings.s3_settings import S3Settings +from admin_api_lib.impl.settings.source_uploader_settings import SourceUploaderSettings from admin_api_lib.impl.settings.summarizer_settings import SummarizerSettings from admin_api_lib.impl.summarizer.langchain_summarizer import LangchainSummarizer from admin_api_lib.prompt_templates.summarize_prompt import SUMMARIZE_PROMPT @@ -85,6 +86,7 @@ class DependencyContainer(DeclarativeContainer): rag_api_settings = 
RAGAPISettings() key_value_store_settings = KeyValueSettings() summarizer_settings = SummarizerSettings() + source_uploader_settings = SourceUploaderSettings() key_value_store = Singleton(FileStatusKeyValueStore, key_value_store_settings) file_service = Singleton(S3Service, s3_settings=s3_settings) @@ -167,6 +169,7 @@ class DependencyContainer(DeclarativeContainer): chunker=chunker, key_value_store=key_value_store, document_deleter=document_deleter, + settings=source_uploader_settings, ) file_uploader = Singleton( From 97bdb25ee46f86682c9536e721d7265a63d8e9f0 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 2 Jun 2025 13:46:11 +0200 Subject: [PATCH 43/56] docs: update README to clarify upload behavior and default timeout configuration --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f246276..38a9349 100644 --- a/README.md +++ b/README.md @@ -144,7 +144,7 @@ The extracted information will be summarized using a LLM. The summary, as well a Loads all the content from an arbitrary non-file source using the [document-extractor](#3-extractor-api-lib). The `type`of the source needs to correspond to an extractor in the [document-extractor](#3-extractor-api-lib). -The extracted information will be summarized using LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api). +The extracted information will be summarized using LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api). An upload timeout is configured. Defaults to 3600 seconds (1 hour). Can be adjusted by values in the helm chart. 
### 2.3 Replaceable parts From a6209daa3f3c0385998bce30fba49a3e9f250b09 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Tue, 3 Jun 2025 13:55:39 +0200 Subject: [PATCH 44/56] feat: implement SitemapExtractor and SitemapLangchainDocument2InformationPiece classes --- .../impl/extractors/sitemap_extractor.py | 62 +++++++++++++++++++ .../sitemap_document2information_piece.py | 6 ++ 2 files changed, 68 insertions(+) create mode 100644 extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py create mode 100644 extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py new file mode 100644 index 0000000..5fa193c --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py @@ -0,0 +1,62 @@ +"""Module for the DefaultSitemapExtractor class.""" + +from langchain_community.document_loaders import SitemapLoader + +from extractor_api_lib.impl.types.extractor_types import ExtractorTypes +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece +from extractor_api_lib.models.extraction_parameters import ExtractionParameters +from extractor_api_lib.extractors.information_extractor import InformationExtractor +from extractor_api_lib.impl.mapper.sitemap_document2information_piece import ( + SitemapLangchainDocument2InformationPiece, +) + + +class SitemapExtractor(InformationExtractor): + """Implementation of the InformationExtractor interface for sitemap.""" + + def __init__( + self, + mapper: SitemapLangchainDocument2InformationPiece, + ): + """ + Initialize the SitemapExtractor. + + Parameters + ---------- + mapper : SitemapLangchainDocument2InformationPiece + An instance of SitemapLangchainDocument2InformationPiece used for mapping langchain documents + to information pieces. 
+ """ + self.mapper = mapper + + @property + def extractor_type(self) -> ExtractorTypes: + return ExtractorTypes.SITEMAP + + async def aextract_content( + self, + extraction_parameters: ExtractionParameters, + ) -> list[InternalInformationPiece]: + """ + Asynchronously extracts information pieces from Sitemap. + + Parameters + ---------- + extraction_parameters : ExtractionParameters + The parameters required to connect to and extract data from Sitemap. + + Returns + ------- + list[InternalInformationPiece] + A list of information pieces extracted from Sitemap. + """ + # Convert list of key value pairs to dict + confluence_loader_parameters = { + x.key: int(x.value) if x.value.isdigit() else x.value for x in extraction_parameters.kwargs + } + # Drop the document_name parameter as it is not used by the SitemapLoader + if "document_name" in confluence_loader_parameters: + confluence_loader_parameters.pop("document_name", None) + document_loader = ConfluenceLoader(**confluence_loader_parameters) + documents = document_loader.load() + return [self.mapper.map_document2informationpiece(x, extraction_parameters.document_name) for x in documents] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py new file mode 100644 index 0000000..5702cb7 --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py @@ -0,0 +1,6 @@ + + + +class SitemapLangchainDocument2InformationPiece: + def __init__(self): + raise NotImplementedError("This method is not implemented.") From 4465591012fa79063c743c9a6d6c4ad64e3d870e Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Tue, 3 Jun 2025 13:55:46 +0200 Subject: [PATCH 45/56] feat: add SitemapExtractor and SitemapLangchainDocument2InformationPiece classes to support sitemap extraction --- .../extractor_api_lib/dependency_container.py | 7 +- 
.../impl/extractors/confluence_extractor.py | 2 +- .../impl/extractors/sitemap_extractor.py | 23 ++++-- .../sitemap_document2information_piece.py | 73 ++++++++++++++++++- .../impl/types/extractor_types.py | 1 + 5 files changed, 96 insertions(+), 10 deletions(-) diff --git a/extractor-api-lib/src/extractor_api_lib/dependency_container.py b/extractor-api-lib/src/extractor_api_lib/dependency_container.py index ad671d9..bcbc1a8 100644 --- a/extractor-api-lib/src/extractor_api_lib/dependency_container.py +++ b/extractor-api-lib/src/extractor_api_lib/dependency_container.py @@ -9,6 +9,7 @@ from extractor_api_lib.impl.extractors.file_extractors.pdf_extractor import PDFExtractor from extractor_api_lib.impl.extractors.file_extractors.xml_extractor import XMLExtractor from extractor_api_lib.impl.api_endpoints.general_file_extractor import GeneralFileExtractor +from extractor_api_lib.impl.extractors.sitemap_extractor import SitemapExtractor from extractor_api_lib.impl.file_services.s3_service import S3Service from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import ( ConfluenceLangchainDocument2InformationPiece, @@ -16,6 +17,7 @@ from extractor_api_lib.impl.mapper.internal2external_information_piece import ( Internal2ExternalInformationPiece, ) +from extractor_api_lib.impl.mapper.sitemap_document2information_piece import SitemapLangchainDocument2InformationPiece from extractor_api_lib.impl.settings.pdf_extractor_settings import PDFExtractorSettings from extractor_api_lib.impl.settings.s3_settings import S3Settings from extractor_api_lib.impl.table_converter.dataframe2markdown import DataFrame2Markdown @@ -36,13 +38,14 @@ class DependencyContainer(DeclarativeContainer): intern2external = Singleton(Internal2ExternalInformationPiece) langchain_document2information_piece = Singleton(ConfluenceLangchainDocument2InformationPiece) + sitemap_document2information_piece = Singleton(SitemapLangchainDocument2InformationPiece) file_extractors = 
List(pdf_extractor, ms_docs_extractor, xml_extractor) general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors, intern2external) confluence_extractor = Singleton(ConfluenceExtractor, mapper=langchain_document2information_piece) - + sitemap_extractor = Singleton(SitemapExtractor, mapper=sitemap_document2information_piece) source_extractor = Singleton( GeneralSourceExtractor, mapper=intern2external, - available_extractors=List(confluence_extractor), + available_extractors=List(confluence_extractor, sitemap_extractor), ) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py index f1c15a6..38f9fae 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py @@ -48,7 +48,7 @@ async def aextract_content( Returns ------- list[InternalInformationPiece] - A list of information pieces extracted from Confluence. + A list of information pieces extracted from Confluence. 
""" # Convert list of key value pairs to dict confluence_loader_parameters = { diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py index 5fa193c..a4e8b00 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py @@ -1,6 +1,8 @@ """Module for the DefaultSitemapExtractor class.""" from langchain_community.document_loaders import SitemapLoader +import asyncio + from extractor_api_lib.impl.types.extractor_types import ExtractorTypes from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece @@ -51,12 +53,23 @@ async def aextract_content( A list of information pieces extracted from Sitemap. """ # Convert list of key value pairs to dict - confluence_loader_parameters = { + sitemap_loader_parameters = { x.key: int(x.value) if x.value.isdigit() else x.value for x in extraction_parameters.kwargs } # Drop the document_name parameter as it is not used by the SitemapLoader - if "document_name" in confluence_loader_parameters: - confluence_loader_parameters.pop("document_name", None) - document_loader = ConfluenceLoader(**confluence_loader_parameters) - documents = document_loader.load() + if "document_name" in sitemap_loader_parameters: + sitemap_loader_parameters.pop("document_name", None) + document_loader = SitemapLoader(**sitemap_loader_parameters) + documents = [] + try: + # Run the synchronous iteration in a thread to avoid event loop conflicts + def load_documents(): + docs = [] + for doc in document_loader.lazy_load(): + docs.append(doc) + return docs + + documents = await asyncio.get_event_loop().run_in_executor(None, load_documents) + except Exception as e: + raise ValueError(f"Failed to load documents from Sitemap: {e}") return [self.mapper.map_document2informationpiece(x, 
extraction_parameters.document_name) for x in documents] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py index 5702cb7..72d37c0 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py @@ -1,6 +1,75 @@ +"""Module for the SitemapLangchainDocument2InformationPiece class.""" +import uuid +from langchain_core.documents import Document as LangchainDocument + +from extractor_api_lib.impl.utils.utils import hash_datetime +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece +from extractor_api_lib.models.content_type import ContentType class SitemapLangchainDocument2InformationPiece: - def __init__(self): - raise NotImplementedError("This method is not implemented.") + """ + A class to map a LangchainDocument to an InformationPiece with Sitemap-specific metadata. + + Attributes + ---------- + USE_CASE_DOCUMENT_URL_KEY : str + Key for the document URL in the use case. + SITEMAP_LOADER_SOURCE_URL_KEY : str + Key for the source URL in the Sitemap loader. + SITEMAP_LOADER_TITLE_KEY : str + Key for the title in the Sitemap loader. + USER_CASE_PAGE_KEY : str + Key for the page in the use case. + USE_CASE_RELATED_KEY : str + Key for related information in the use case. + DOCUMENT_KEY : str + Key for the document. + """ + + USE_CASE_DOCUMENT_URL_KEY = "document_url" + SITEMAP_LOADER_SOURCE_URL_KEY = "source" + SITEMAP_LOADER_TITLE_KEY = "title" + USER_CASE_PAGE_KEY = "page" + USE_CASE_RELATED_KEY = "related" + DOCUMENT_KEY = "document" + ID_KEY = "id" + + def map_document2informationpiece( + self, document: LangchainDocument, document_name: str + ) -> InternalInformationPiece: + """ + Map a LangchainDocument to an InformationPiece. 
+ + Parameters + ---------- + document : LangchainDocument + The document to be mapped. + + Returns + ------- + InformationPiece + The mapped information piece containing page content, type, and metadata. + + Raises + ------ + ValueError + If Sitemap parameters are not set before mapping documents. + """ + meta = self._map_meta(document.metadata, document_name) + return InternalInformationPiece(page_content=document.page_content, type=ContentType.TEXT, metadata=meta) + + def _map_meta(self, internal: dict, document_name: str) -> dict: + metadata = {} + for key, value in internal.items(): + metadata[self.USE_CASE_DOCUMENT_URL_KEY if key == self.SITEMAP_LOADER_SOURCE_URL_KEY else key] = value + + page_title_matches = [v for k, v in metadata.items() if k == self.SITEMAP_LOADER_TITLE_KEY] + page_title = page_title_matches[0] if page_title_matches else "Unknown Title" + + metadata[self.USER_CASE_PAGE_KEY] = page_title + metadata[self.DOCUMENT_KEY] = document_name + metadata[self.USE_CASE_RELATED_KEY] = [] + metadata[self.ID_KEY] = hash_datetime() + return metadata diff --git a/extractor-api-lib/src/extractor_api_lib/impl/types/extractor_types.py b/extractor-api-lib/src/extractor_api_lib/impl/types/extractor_types.py index 8a9a403..c4efaa4 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/types/extractor_types.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/types/extractor_types.py @@ -6,4 +6,5 @@ class ExtractorTypes(StrEnum): FILE = "file" CONFLUENCE = "confluence" + SITEMAP = "sitemap" NONE = "None" From 0cbd5e395b6d33114879502ec1943554ab0639ff Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Tue, 3 Jun 2025 16:03:38 +0200 Subject: [PATCH 46/56] feat: add fake-useragent dependency and enhance SitemapExtractor to handle JSON header templates --- extractor-api-lib/poetry.lock | 14 +++++++++++++- extractor-api-lib/pyproject.toml | 1 + .../impl/extractors/sitemap_extractor.py | 17 +++++++++++++---- 3 files changed, 27 insertions(+), 5 deletions(-) diff 
--git a/extractor-api-lib/poetry.lock b/extractor-api-lib/poetry.lock index c750e96..a1bc91d 100644 --- a/extractor-api-lib/poetry.lock +++ b/extractor-api-lib/poetry.lock @@ -1042,6 +1042,18 @@ files = [ [package.extras] tests = ["pytest"] +[[package]] +name = "fake-useragent" +version = "2.2.0" +description = "Up-to-date simple useragent faker with real world database" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "fake_useragent-2.2.0-py3-none-any.whl", hash = "sha256:67f35ca4d847b0d298187443aaf020413746e56acd985a611908c73dba2daa24"}, + {file = "fake_useragent-2.2.0.tar.gz", hash = "sha256:4e6ab6571e40cc086d788523cf9e018f618d07f9050f822ff409a4dfe17c16b2"}, +] + [[package]] name = "fastapi" version = "0.115.12" @@ -4877,4 +4889,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.13" -content-hash = "a25945d5914b2ad6c32bcd50f8b787c00e41df7e09fdb3c991f48cb9e9c15c72" +content-hash = "6ce3a0cec80ac06536113e984e478ebc8f3e398ba1226c0d6b920814d4796b49" diff --git a/extractor-api-lib/pyproject.toml b/extractor-api-lib/pyproject.toml index a648858..814e68a 100644 --- a/extractor-api-lib/pyproject.toml +++ b/extractor-api-lib/pyproject.toml @@ -93,6 +93,7 @@ langchain-community = "^0.3.23" atlassian-python-api = "^4.0.3" markdownify = "^1.1.0" langchain-core = "0.3.63" +fake-useragent = "^2.2.0" [tool.poetry.group.dev.dependencies] pytest = "^8.3.5" diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py index a4e8b00..f9ded11 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py @@ -2,7 +2,7 @@ from langchain_community.document_loaders import SitemapLoader import asyncio - +import json from extractor_api_lib.impl.types.extractor_types import ExtractorTypes from 
extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece @@ -53,9 +53,18 @@ async def aextract_content( A list of information pieces extracted from Sitemap. """ # Convert list of key value pairs to dict - sitemap_loader_parameters = { - x.key: int(x.value) if x.value.isdigit() else x.value for x in extraction_parameters.kwargs - } + sitemap_loader_parameters = {} + for x in extraction_parameters.kwargs: + if x.key == "header_template": + # Parse JSON string back to dictionary + try: + sitemap_loader_parameters[x.key] = json.loads(x.value) + except (json.JSONDecodeError, TypeError): + # If it's not a valid JSON string, treat as regular value + sitemap_loader_parameters[x.key] = x.value + else: + sitemap_loader_parameters[x.key] = int(x.value) if x.value.isdigit() else x.value + # Drop the document_name parameter as it is not used by the SitemapLoader if "document_name" in sitemap_loader_parameters: sitemap_loader_parameters.pop("document_name", None) From 387e465284d7a6f67afe4d138430aea957c8ac41 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Wed, 4 Jun 2025 15:42:19 +0200 Subject: [PATCH 47/56] feat: enhance SitemapExtractor to support JSON header templates and improve parameter handling --- .../impl/extractors/sitemap_extractor.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py index f9ded11..d54aee4 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py @@ -52,31 +52,33 @@ async def aextract_content( list[InternalInformationPiece] A list of information pieces extracted from Sitemap. 
""" - # Convert list of key value pairs to dict sitemap_loader_parameters = {} + headers = None + for x in extraction_parameters.kwargs: if x.key == "header_template": - # Parse JSON string back to dictionary + try: + headers = json.loads(x.value) + except (json.JSONDecodeError, TypeError): + headers = x.value if isinstance(x.value, dict) else None + elif x.key == "filter_urls": try: sitemap_loader_parameters[x.key] = json.loads(x.value) except (json.JSONDecodeError, TypeError): - # If it's not a valid JSON string, treat as regular value sitemap_loader_parameters[x.key] = x.value else: sitemap_loader_parameters[x.key] = int(x.value) if x.value.isdigit() else x.value - # Drop the document_name parameter as it is not used by the SitemapLoader + if headers: + sitemap_loader_parameters["header_template"] = headers + if "document_name" in sitemap_loader_parameters: sitemap_loader_parameters.pop("document_name", None) document_loader = SitemapLoader(**sitemap_loader_parameters) documents = [] try: - # Run the synchronous iteration in a thread to avoid event loop conflicts def load_documents(): - docs = [] - for doc in document_loader.lazy_load(): - docs.append(doc) - return docs + return list(document_loader.lazy_load()) documents = await asyncio.get_event_loop().run_in_executor(None, load_documents) except Exception as e: From 93534ca76f02bbdd6f592aa38c79d9cd6c335241 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Wed, 4 Jun 2025 16:04:52 +0200 Subject: [PATCH 48/56] feat: add comprehensive test suite for SitemapExtractor class --- extractor-api-lib/tests/dummy5_test.py | 7 - .../tests/sitemap_extractor_test.py | 478 ++++++++++++++++++ 2 files changed, 478 insertions(+), 7 deletions(-) delete mode 100644 extractor-api-lib/tests/dummy5_test.py create mode 100644 extractor-api-lib/tests/sitemap_extractor_test.py diff --git a/extractor-api-lib/tests/dummy5_test.py b/extractor-api-lib/tests/dummy5_test.py deleted file mode 100644 index 8bfd161..0000000 --- 
a/extractor-api-lib/tests/dummy5_test.py +++ /dev/null @@ -1,7 +0,0 @@ -"""Module for the dummy test.""" - - -def test_dummy() -> None: - """Dummy test.""" - print("Dummy test.") - assert True diff --git a/extractor-api-lib/tests/sitemap_extractor_test.py b/extractor-api-lib/tests/sitemap_extractor_test.py new file mode 100644 index 0000000..1782358 --- /dev/null +++ b/extractor-api-lib/tests/sitemap_extractor_test.py @@ -0,0 +1,478 @@ +"""Comprehensive test suite for SitemapExtractor class.""" + +import asyncio +import json +import pytest +from unittest.mock import AsyncMock, MagicMock, patch, Mock +from langchain_core.documents import Document as LangchainDocument + +from extractor_api_lib.impl.extractors.sitemap_extractor import SitemapExtractor +from extractor_api_lib.impl.types.extractor_types import ExtractorTypes +from extractor_api_lib.models.extraction_parameters import ExtractionParameters +from extractor_api_lib.models.key_value_pair import KeyValuePair +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece +from extractor_api_lib.impl.mapper.sitemap_document2information_piece import ( + SitemapLangchainDocument2InformationPiece, +) +from extractor_api_lib.impl.types.content_type import ContentType + + +class TestSitemapExtractor: + """Test class for SitemapExtractor.""" + + @pytest.fixture + def mock_mapper(self): + """Create a mock mapper for testing.""" + mapper = MagicMock(spec=SitemapLangchainDocument2InformationPiece) + mapper.map_document2informationpiece.return_value = InternalInformationPiece( + type=ContentType.TEXT, + metadata={"document": "test_doc", "id": "test_id", "related": []}, + page_content="Test content" + ) + return mapper + + @pytest.fixture + def sitemap_extractor(self, mock_mapper): + """Create a SitemapExtractor instance for testing.""" + return SitemapExtractor(mapper=mock_mapper) + + @pytest.fixture + def sample_extraction_parameters(self): + """Create sample extraction 
parameters.""" + return ExtractionParameters( + document_name="test_sitemap_doc", + source_type="sitemap", + kwargs=[ + KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"), + KeyValuePair(key="filter_urls", value='["https://example.com/page1", "https://example.com/page2"]'), + KeyValuePair(key="header_template", value='{"User-Agent": "test-agent"}'), + KeyValuePair(key="max_depth", value="2"), + KeyValuePair(key="blocksize", value="10") + ] + ) + + def test_init(self, mock_mapper): + """Test SitemapExtractor initialization.""" + extractor = SitemapExtractor(mapper=mock_mapper) + assert extractor.mapper == mock_mapper + + def test_extractor_type(self, sitemap_extractor): + """Test that extractor_type returns SITEMAP.""" + assert sitemap_extractor.extractor_type == ExtractorTypes.SITEMAP + + @pytest.mark.asyncio + @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + async def test_aextract_content_basic(self, mock_sitemap_loader_class, sitemap_extractor, sample_extraction_parameters): + """Test basic content extraction functionality.""" + # Setup mock SitemapLoader + mock_loader_instance = MagicMock() + mock_sitemap_loader_class.return_value = mock_loader_instance + + # Create mock documents + mock_documents = [ + LangchainDocument( + page_content="Content from page 1", + metadata={"source": "https://example.com/page1", "title": "Page 1"} + ), + LangchainDocument( + page_content="Content from page 2", + metadata={"source": "https://example.com/page2", "title": "Page 2"} + ) + ] + + mock_loader_instance.lazy_load.return_value = iter(mock_documents) + + # Setup mock mapper + expected_info_pieces = [ + InternalInformationPiece( + type=ContentType.TEXT, + metadata={"document": "test_sitemap_doc", "id": "id1", "related": []}, + page_content="Content from page 1" + ), + InternalInformationPiece( + type=ContentType.TEXT, + metadata={"document": "test_sitemap_doc", "id": "id2", "related": []}, + page_content="Content from page 2" + 
) + ] + + sitemap_extractor.mapper.map_document2informationpiece.side_effect = expected_info_pieces + + # Execute + result = await sitemap_extractor.aextract_content(sample_extraction_parameters) + + # Verify + assert len(result) == 2 + assert all(isinstance(piece, InternalInformationPiece) for piece in result) + + # Verify SitemapLoader was called with correct parameters + mock_sitemap_loader_class.assert_called_once() + call_args = mock_sitemap_loader_class.call_args[1] + + assert call_args["web_path"] == "https://example.com/sitemap.xml" + assert call_args["filter_urls"] == ["https://example.com/page1", "https://example.com/page2"] + assert call_args["header_template"] == {"User-Agent": "test-agent"} + assert call_args["max_depth"] == 2 + assert call_args["blocksize"] == 10 + + # Verify mapper was called for each document + assert sitemap_extractor.mapper.map_document2informationpiece.call_count == 2 + + @pytest.mark.asyncio + @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + async def test_aextract_content_json_parsing_failure(self, mock_sitemap_loader_class, sitemap_extractor): + """Test extraction with invalid JSON in parameters falls back to string values.""" + # Create parameters with invalid JSON + extraction_params = ExtractionParameters( + document_name="test_doc", + source_type="sitemap", + kwargs=[ + KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"), + KeyValuePair(key="filter_urls", value="invalid-json["), + KeyValuePair(key="header_template", value="invalid-json{") + ] + ) + + # Setup mock + mock_loader_instance = MagicMock() + mock_sitemap_loader_class.return_value = mock_loader_instance + mock_loader_instance.lazy_load.return_value = iter([]) + + # Execute + result = await sitemap_extractor.aextract_content(extraction_params) + + # Verify + assert result == [] + + # Verify SitemapLoader was called with string fallback values + call_args = mock_sitemap_loader_class.call_args[1] + assert 
call_args["filter_urls"] == "invalid-json[" + assert "header_template" not in call_args # Should not be set due to invalid JSON + + @pytest.mark.asyncio + @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + async def test_aextract_content_header_template_dict_value(self, mock_sitemap_loader_class, sitemap_extractor): + """Test extraction when header_template is already a dict.""" + extraction_params = ExtractionParameters( + document_name="test_doc", + source_type="sitemap", + kwargs=[ + KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"), + KeyValuePair(key="header_template", value={"User-Agent": "direct-dict"}) + ] + ) + + # Setup mock + mock_loader_instance = MagicMock() + mock_sitemap_loader_class.return_value = mock_loader_instance + mock_loader_instance.lazy_load.return_value = iter([]) + + # Execute + result = await sitemap_extractor.aextract_content(extraction_params) + + # Verify + call_args = mock_sitemap_loader_class.call_args[1] + assert call_args["header_template"] == {"User-Agent": "direct-dict"} + + @pytest.mark.asyncio + @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + async def test_aextract_content_document_name_removed(self, mock_sitemap_loader_class, sitemap_extractor): + """Test that document_name parameter is removed from SitemapLoader parameters.""" + extraction_params = ExtractionParameters( + document_name="test_doc", + source_type="sitemap", + kwargs=[ + KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"), + KeyValuePair(key="document_name", value="should_be_removed") + ] + ) + + # Setup mock + mock_loader_instance = MagicMock() + mock_sitemap_loader_class.return_value = mock_loader_instance + mock_loader_instance.lazy_load.return_value = iter([]) + + # Execute + await sitemap_extractor.aextract_content(extraction_params) + + # Verify document_name was removed from loader parameters + call_args = mock_sitemap_loader_class.call_args[1] + assert 
"document_name" not in call_args + + @pytest.mark.asyncio + @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + async def test_aextract_content_numeric_parameters(self, mock_sitemap_loader_class, sitemap_extractor): + """Test extraction with numeric string parameters.""" + extraction_params = ExtractionParameters( + document_name="test_doc", + source_type="sitemap", + kwargs=[ + KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"), + KeyValuePair(key="max_depth", value="5"), + KeyValuePair(key="blocksize", value="20"), + KeyValuePair(key="blocknum", value="1"), + KeyValuePair(key="non_numeric", value="not_a_number") + ] + ) + + # Setup mock + mock_loader_instance = MagicMock() + mock_sitemap_loader_class.return_value = mock_loader_instance + mock_loader_instance.lazy_load.return_value = iter([]) + + # Execute + await sitemap_extractor.aextract_content(extraction_params) + + # Verify numeric conversion + call_args = mock_sitemap_loader_class.call_args[1] + assert call_args["max_depth"] == 5 + assert call_args["blocksize"] == 20 + assert call_args["blocknum"] == 1 + assert call_args["non_numeric"] == "not_a_number" + + @pytest.mark.asyncio + @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + async def test_aextract_content_loader_exception(self, mock_sitemap_loader_class, sitemap_extractor, sample_extraction_parameters): + """Test handling of SitemapLoader exceptions.""" + # Setup mock to raise exception + mock_loader_instance = MagicMock() + mock_sitemap_loader_class.return_value = mock_loader_instance + mock_loader_instance.lazy_load.side_effect = Exception("Network error") + + # Execute and verify exception is raised + with pytest.raises(ValueError, match="Failed to load documents from Sitemap: Network error"): + await sitemap_extractor.aextract_content(sample_extraction_parameters) + + @pytest.mark.asyncio + @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + async def 
test_aextract_content_empty_documents(self, mock_sitemap_loader_class, sitemap_extractor, sample_extraction_parameters): + """Test extraction when SitemapLoader returns no documents.""" + # Setup mock to return empty list + mock_loader_instance = MagicMock() + mock_sitemap_loader_class.return_value = mock_loader_instance + mock_loader_instance.lazy_load.return_value = iter([]) + + # Execute + result = await sitemap_extractor.aextract_content(sample_extraction_parameters) + + # Verify + assert result == [] + sitemap_extractor.mapper.map_document2informationpiece.assert_not_called() + + @pytest.mark.asyncio + @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + async def test_aextract_content_minimal_parameters(self, mock_sitemap_loader_class, sitemap_extractor): + """Test extraction with minimal required parameters.""" + extraction_params = ExtractionParameters( + document_name="minimal_doc", + source_type="sitemap", + kwargs=[ + KeyValuePair(key="web_path", value="https://example.com/sitemap.xml") + ] + ) + + # Setup mock + mock_loader_instance = MagicMock() + mock_sitemap_loader_class.return_value = mock_loader_instance + mock_documents = [LangchainDocument(page_content="Minimal content", metadata={})] + mock_loader_instance.lazy_load.return_value = iter(mock_documents) + + # Execute + result = await sitemap_extractor.aextract_content(extraction_params) + + # Verify + assert len(result) == 1 + mock_sitemap_loader_class.assert_called_once_with(web_path="https://example.com/sitemap.xml") + + @pytest.mark.asyncio + @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + async def test_aextract_content_complex_filter_urls(self, mock_sitemap_loader_class, sitemap_extractor): + """Test extraction with complex filter_urls JSON array.""" + extraction_params = ExtractionParameters( + document_name="complex_doc", + source_type="sitemap", + kwargs=[ + KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"), + 
KeyValuePair(key="filter_urls", value='[".*\\\\.html$", ".*page[0-9]+.*", "https://example\\\\.com/special/.*"]') + ] + ) + + # Setup mock + mock_loader_instance = MagicMock() + mock_sitemap_loader_class.return_value = mock_loader_instance + mock_loader_instance.lazy_load.return_value = iter([]) + + # Execute + await sitemap_extractor.aextract_content(extraction_params) + + # Verify complex JSON parsing + call_args = mock_sitemap_loader_class.call_args[1] + expected_patterns = [".*\\.html$", ".*page[0-9]+.*", "https://example\\.com/special/.*"] + assert call_args["filter_urls"] == expected_patterns + + @pytest.mark.asyncio + @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + async def test_aextract_content_no_headers(self, mock_sitemap_loader_class, sitemap_extractor): + """Test extraction without header_template parameter.""" + extraction_params = ExtractionParameters( + document_name="no_headers_doc", + source_type="sitemap", + kwargs=[ + KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"), + KeyValuePair(key="max_depth", value="3") + ] + ) + + # Setup mock + mock_loader_instance = MagicMock() + mock_sitemap_loader_class.return_value = mock_loader_instance + mock_loader_instance.lazy_load.return_value = iter([]) + + # Execute + await sitemap_extractor.aextract_content(extraction_params) + + # Verify no header_template in call args + call_args = mock_sitemap_loader_class.call_args[1] + assert "header_template" not in call_args + assert call_args["max_depth"] == 3 + + @pytest.mark.asyncio + @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + async def test_aextract_content_with_real_langchain_documents(self, mock_sitemap_loader_class, sitemap_extractor): + """Test extraction with realistic LangChain Document objects.""" + extraction_params = ExtractionParameters( + document_name="realistic_doc", + source_type="sitemap", + kwargs=[KeyValuePair(key="web_path", 
value="https://example.com/sitemap.xml")] + ) + + # Create realistic documents + mock_documents = [ + LangchainDocument( + page_content="

Welcome to Example

This is the homepage content with useful information about our services.

", + metadata={ + "source": "https://example.com/", + "title": "Example Homepage", + "loc": "https://example.com/", + "lastmod": "2023-12-01", + "changefreq": "weekly", + "priority": "1.0" + } + ), + LangchainDocument( + page_content="

About Us

Learn more about our company history and mission.

", + metadata={ + "source": "https://example.com/about", + "title": "About Us - Example", + "loc": "https://example.com/about", + "lastmod": "2023-11-15" + } + ) + ] + + # Setup mock + mock_loader_instance = MagicMock() + mock_sitemap_loader_class.return_value = mock_loader_instance + mock_loader_instance.lazy_load.return_value = iter(mock_documents) + + # Execute + result = await sitemap_extractor.aextract_content(extraction_params) + + # Verify + assert len(result) == 2 + assert sitemap_extractor.mapper.map_document2informationpiece.call_count == 2 + + # Verify mapper was called with correct arguments + for i, call in enumerate(sitemap_extractor.mapper.map_document2informationpiece.call_args_list): + args, kwargs = call + assert args[0] == mock_documents[i] + assert args[1] == "realistic_doc" + + @pytest.mark.asyncio + @patch('extractor_api_lib.impl.extractors.sitemap_extractor.asyncio.get_event_loop') + @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + async def test_aextract_content_executor_usage(self, mock_sitemap_loader_class, mock_get_event_loop, sitemap_extractor, sample_extraction_parameters): + """Test that content extraction uses executor for non-async sitemap loading.""" + # Setup mocks + mock_loop = MagicMock() + mock_get_event_loop.return_value = mock_loop + + mock_loader_instance = MagicMock() + mock_sitemap_loader_class.return_value = mock_loader_instance + + # Create a future that resolves to documents + mock_documents = [LangchainDocument(page_content="Test content", metadata={})] + future = asyncio.Future() + future.set_result(mock_documents) + mock_loop.run_in_executor.return_value = future + + # Execute + result = await sitemap_extractor.aextract_content(sample_extraction_parameters) + + # Verify executor was used + mock_loop.run_in_executor.assert_called_once() + executor_call_args = mock_loop.run_in_executor.call_args + assert executor_call_args[0][0] is None # First arg should be None (default executor) + assert 
callable(executor_call_args[0][1]) # Second arg should be a callable + + def test_extractor_inheritance(self, sitemap_extractor): + """Test that SitemapExtractor properly inherits from InformationExtractor.""" + from extractor_api_lib.extractors.information_extractor import InformationExtractor + assert isinstance(sitemap_extractor, InformationExtractor) + + @pytest.mark.asyncio + @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + async def test_aextract_content_edge_case_empty_kwargs(self, mock_sitemap_loader_class, sitemap_extractor): + """Test extraction with empty kwargs list.""" + extraction_params = ExtractionParameters( + document_name="empty_kwargs_doc", + source_type="sitemap", + kwargs=[] + ) + + # Setup mock + mock_loader_instance = MagicMock() + mock_sitemap_loader_class.return_value = mock_loader_instance + mock_loader_instance.lazy_load.return_value = iter([]) + + # Execute + result = await sitemap_extractor.aextract_content(extraction_params) + + # Verify + assert result == [] + # Should still call SitemapLoader but with no additional parameters + mock_sitemap_loader_class.assert_called_once_with() + + @pytest.mark.asyncio + @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + async def test_aextract_content_mixed_parameter_types(self, mock_sitemap_loader_class, sitemap_extractor): + """Test extraction with mixed parameter types (strings, numbers, JSON).""" + extraction_params = ExtractionParameters( + document_name="mixed_doc", + source_type="sitemap", + kwargs=[ + KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"), + KeyValuePair(key="max_depth", value="3"), # Will be converted to int + KeyValuePair(key="continue_on_failure", value="true"), # Will remain string + KeyValuePair(key="filter_urls", value='["pattern1", "pattern2"]'), # Will be parsed as JSON + KeyValuePair(key="header_template", value='{"Authorization": "Bearer token123"}'), # Will be parsed as JSON + 
KeyValuePair(key="custom_param", value="custom_value") # Will remain string + ] + ) + + # Setup mock + mock_loader_instance = MagicMock() + mock_sitemap_loader_class.return_value = mock_loader_instance + mock_loader_instance.lazy_load.return_value = iter([]) + + # Execute + await sitemap_extractor.aextract_content(extraction_params) + + # Verify parameter processing + call_args = mock_sitemap_loader_class.call_args[1] + assert call_args["web_path"] == "https://example.com/sitemap.xml" + assert call_args["max_depth"] == 3 # Converted to int + assert call_args["continue_on_failure"] == "true" # Remained string + assert call_args["filter_urls"] == ["pattern1", "pattern2"] # Parsed JSON + assert call_args["header_template"] == {"Authorization": "Bearer token123"} # Parsed JSON + assert call_args["custom_param"] == "custom_value" # Remained string From 3d13637e26b35d74493acff6b4652e8d9fb53936 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Thu, 5 Jun 2025 10:37:45 +0200 Subject: [PATCH 49/56] feat: enhance DependencyContainer and SitemapExtractor with custom parsing and meta functions --- .../extractor_api_lib/dependency_container.py | 18 ++++-- .../impl/extractors/confluence_extractor.py | 4 +- .../impl/extractors/sitemap_extractor.py | 23 ++++++- ...ce_langchain_document2information_piece.py | 41 ++---------- .../sitemap_document2information_piece.py | 54 ++++------------ .../impl/utils/sitemap_extractor_utils.py | 38 +++++++++++ .../src/extractor_api_lib/mapper/__init__.py | 0 ...ce_langchain_document2information_piece.py | 63 +++++++++++++++++++ .../tests/sitemap_extractor_test.py | 2 +- 9 files changed, 156 insertions(+), 87 deletions(-) create mode 100644 extractor-api-lib/src/extractor_api_lib/impl/utils/sitemap_extractor_utils.py create mode 100644 extractor-api-lib/src/extractor_api_lib/mapper/__init__.py create mode 100644 extractor-api-lib/src/extractor_api_lib/mapper/source_langchain_document2information_piece.py diff --git 
a/extractor-api-lib/src/extractor_api_lib/dependency_container.py b/extractor-api-lib/src/extractor_api_lib/dependency_container.py index bcbc1a8..6a83a42 100644 --- a/extractor-api-lib/src/extractor_api_lib/dependency_container.py +++ b/extractor-api-lib/src/extractor_api_lib/dependency_container.py @@ -1,7 +1,7 @@ """Module for dependency injection container for managing application dependencies.""" from dependency_injector.containers import DeclarativeContainer -from dependency_injector.providers import List, Singleton # noqa: WOT001 +from dependency_injector.providers import Factory, List, Singleton # noqa: WOT001 from extractor_api_lib.impl.api_endpoints.general_source_extractor import GeneralSourceExtractor from extractor_api_lib.impl.extractors.confluence_extractor import ConfluenceExtractor @@ -21,14 +21,18 @@ from extractor_api_lib.impl.settings.pdf_extractor_settings import PDFExtractorSettings from extractor_api_lib.impl.settings.s3_settings import S3Settings from extractor_api_lib.impl.table_converter.dataframe2markdown import DataFrame2Markdown +from extractor_api_lib.impl.utils.sitemap_extractor_utils import custom_sitemap_meta_function, custom_sitemap_parser_function class DependencyContainer(DeclarativeContainer): """Dependency injection container for managing application dependencies.""" # Settings - settings_s3 = Singleton(S3Settings) - settings_pdf_extractor = Singleton(PDFExtractorSettings) + settings_s3 = S3Settings() + settings_pdf_extractor = PDFExtractorSettings() + + sitemap_parsing_function = Factory(lambda: custom_sitemap_parser_function) + sitemap_meta_function = Factory(lambda: custom_sitemap_meta_function) database_converter = Singleton(DataFrame2Markdown) file_service = Singleton(S3Service, settings_s3) @@ -43,7 +47,13 @@ class DependencyContainer(DeclarativeContainer): general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors, intern2external) confluence_extractor = Singleton(ConfluenceExtractor, 
mapper=langchain_document2information_piece) - sitemap_extractor = Singleton(SitemapExtractor, mapper=sitemap_document2information_piece) + + sitemap_extractor = Singleton( + SitemapExtractor, + mapper=sitemap_document2information_piece, + parsing_function=sitemap_parsing_function, + meta_function=sitemap_meta_function + ) source_extractor = Singleton( GeneralSourceExtractor, mapper=intern2external, diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py index f1c15a6..8694aa1 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py @@ -27,7 +27,7 @@ def __init__( An instance of ConfluenceLangchainDocument2InformationPiece used for mapping langchain documents to information pieces. """ - self.mapper = mapper + self._mapper = mapper @property def extractor_type(self) -> ExtractorTypes: @@ -59,4 +59,4 @@ async def aextract_content( confluence_loader_parameters.pop("document_name", None) document_loader = ConfluenceLoader(**confluence_loader_parameters) documents = document_loader.load() - return [self.mapper.map_document2informationpiece(x, extraction_parameters.document_name) for x in documents] + return [self._mapper.map_document2informationpiece(x, extraction_parameters.document_name) for x in documents] diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py index d54aee4..8448740 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py @@ -1,5 +1,6 @@ """Module for the DefaultSitemapExtractor class.""" +from typing import Optional from langchain_community.document_loaders import SitemapLoader import 
asyncio import json @@ -19,6 +20,8 @@ class SitemapExtractor(InformationExtractor): def __init__( self, mapper: SitemapLangchainDocument2InformationPiece, + parsing_function: Optional[callable] = None, + meta_function: Optional[callable] = None, ): """ Initialize the SitemapExtractor. @@ -29,12 +32,19 @@ def __init__( An instance of SitemapLangchainDocument2InformationPiece used for mapping langchain documents to information pieces. """ - self.mapper = mapper + self._mapper = mapper + self._parsing_function = parsing_function + self._meta_function = meta_function @property def extractor_type(self) -> ExtractorTypes: return ExtractorTypes.SITEMAP + @property + def mapper(self) -> SitemapLangchainDocument2InformationPiece: + """Get the mapper instance.""" + return self._mapper + async def aextract_content( self, extraction_parameters: ExtractionParameters, @@ -74,6 +84,15 @@ async def aextract_content( if "document_name" in sitemap_loader_parameters: sitemap_loader_parameters.pop("document_name", None) + + # Only pass custom functions if they are provided + if self._parsing_function is not None: + # Get the actual function from the provider + sitemap_loader_parameters["parsing_function"] = self._parsing_function + if self._meta_function is not None: + # Get the actual function from the provider + sitemap_loader_parameters["meta_function"] = self._meta_function + document_loader = SitemapLoader(**sitemap_loader_parameters) documents = [] try: @@ -83,4 +102,4 @@ def load_documents(): documents = await asyncio.get_event_loop().run_in_executor(None, load_documents) except Exception as e: raise ValueError(f"Failed to load documents from Sitemap: {e}") - return [self.mapper.map_document2informationpiece(x, extraction_parameters.document_name) for x in documents] + return [self._mapper.map_document2informationpiece(x, extraction_parameters.document_name) for x in documents] diff --git 
a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py index a7bcb0d..0957687 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py @@ -2,11 +2,12 @@ from langchain_core.documents import Document as LangchainDocument +from extractor_api_lib.mapper.source_langchain_document2information_piece import SourceLangchainDocument2InformationPiece from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece from extractor_api_lib.models.content_type import ContentType -class ConfluenceLangchainDocument2InformationPiece: +class ConfluenceLangchainDocument2InformationPiece(SourceLangchainDocument2InformationPiece): """ A class to map a LangchainDocument to an InformationPiece with Confluence-specific metadata. @@ -14,9 +15,9 @@ class ConfluenceLangchainDocument2InformationPiece: ---------- USE_CASE_DOCUMENT_URL_KEY : str Key for the document URL in the use case. - CONFLUENCE_LOADER_SOURCE_URL_KEY : str + SOURCE_LOADER_SOURCE_URL_KEY : str Key for the source URL in the Confluence loader. - CONFLUENCE_LOADER_TITLE_KEY : str + SOURCE_LOADER_TITLE_KEY : str Key for the title in the Confluence loader. USER_CASE_PAGE_KEY : str Key for the page in the use case. @@ -26,43 +27,13 @@ class ConfluenceLangchainDocument2InformationPiece: Key for the document. 
""" - USE_CASE_DOCUMENT_URL_KEY = "document_url" - CONFLUENCE_LOADER_SOURCE_URL_KEY = "source" - CONFLUENCE_LOADER_TITLE_KEY = "title" - USER_CASE_PAGE_KEY = "page" - USE_CASE_RELATED_KEY = "related" - DOCUMENT_KEY = "document" - - def map_document2informationpiece( - self, document: LangchainDocument, document_name: str - ) -> InternalInformationPiece: - """ - Map a LangchainDocument to an InformationPiece. - - Parameters - ---------- - document : LangchainDocument - The document to be mapped. - - Returns - ------- - InformationPiece - The mapped information piece containing page content, type, and metadata. - - Raises - ------ - ValueError - If Confluence parameters are not set before mapping documents. - """ - meta = self._map_meta(document.metadata, document_name) - return InternalInformationPiece(page_content=document.page_content, type=ContentType.TEXT, metadata=meta) def _map_meta(self, internal: dict, document_name: str) -> dict: metadata = {} for key, value in internal.items(): - metadata[self.USE_CASE_DOCUMENT_URL_KEY if key == self.CONFLUENCE_LOADER_SOURCE_URL_KEY else key] = value + metadata[self.USE_CASE_DOCUMENT_URL_KEY if key == self.SOURCE_LOADER_SOURCE_URL_KEY else key] = value - page_title_matches = [v for k, v in metadata.items() if k == self.CONFLUENCE_LOADER_TITLE_KEY] + page_title_matches = [v for k, v in metadata.items() if k == self.SOURCE_LOADER_TITLE_KEY] page_title = page_title_matches[0] if page_title_matches else "Unknown Title" metadata[self.USER_CASE_PAGE_KEY] = page_title diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py index 72d37c0..c239ce7 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py @@ -1,14 +1,10 @@ """Module for the 
SitemapLangchainDocument2InformationPiece class.""" - -import uuid -from langchain_core.documents import Document as LangchainDocument - from extractor_api_lib.impl.utils.utils import hash_datetime -from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece -from extractor_api_lib.models.content_type import ContentType +from extractor_api_lib.mapper.source_langchain_document2information_piece import SourceLangchainDocument2InformationPiece -class SitemapLangchainDocument2InformationPiece: + +class SitemapLangchainDocument2InformationPiece(SourceLangchainDocument2InformationPiece): """ A class to map a LangchainDocument to an InformationPiece with Sitemap-specific metadata. @@ -16,56 +12,28 @@ class SitemapLangchainDocument2InformationPiece: ---------- USE_CASE_DOCUMENT_URL_KEY : str Key for the document URL in the use case. - SITEMAP_LOADER_SOURCE_URL_KEY : str - Key for the source URL in the Sitemap loader. - SITEMAP_LOADER_TITLE_KEY : str - Key for the title in the Sitemap loader. + SOURCE_LOADER_SOURCE_URL_KEY : str + The key for the source URL in the Sitemap loader. + SOURCE_LOADER_TITLE_KEY : str + The key for the title in the Sitemap loader. USER_CASE_PAGE_KEY : str Key for the page in the use case. USE_CASE_RELATED_KEY : str Key for related information in the use case. DOCUMENT_KEY : str Key for the document. + ID_KEY : str + Key for the unique identifier of the information piece. """ - USE_CASE_DOCUMENT_URL_KEY = "document_url" - SITEMAP_LOADER_SOURCE_URL_KEY = "source" - SITEMAP_LOADER_TITLE_KEY = "title" - USER_CASE_PAGE_KEY = "page" - USE_CASE_RELATED_KEY = "related" - DOCUMENT_KEY = "document" ID_KEY = "id" - def map_document2informationpiece( - self, document: LangchainDocument, document_name: str - ) -> InternalInformationPiece: - """ - Map a LangchainDocument to an InformationPiece. - - Parameters - ---------- - document : LangchainDocument - The document to be mapped. 
- - Returns - ------- - InformationPiece - The mapped information piece containing page content, type, and metadata. - - Raises - ------ - ValueError - If Sitemap parameters are not set before mapping documents. - """ - meta = self._map_meta(document.metadata, document_name) - return InternalInformationPiece(page_content=document.page_content, type=ContentType.TEXT, metadata=meta) - def _map_meta(self, internal: dict, document_name: str) -> dict: metadata = {} for key, value in internal.items(): - metadata[self.USE_CASE_DOCUMENT_URL_KEY if key == self.SITEMAP_LOADER_SOURCE_URL_KEY else key] = value + metadata[self.USE_CASE_DOCUMENT_URL_KEY if key == self.SOURCE_LOADER_SOURCE_URL_KEY else key] = value - page_title_matches = [v for k, v in metadata.items() if k == self.SITEMAP_LOADER_TITLE_KEY] + page_title_matches = [v for k, v in metadata.items() if k == self.SOURCE_LOADER_TITLE_KEY] page_title = page_title_matches[0] if page_title_matches else "Unknown Title" metadata[self.USER_CASE_PAGE_KEY] = page_title diff --git a/extractor-api-lib/src/extractor_api_lib/impl/utils/sitemap_extractor_utils.py b/extractor-api-lib/src/extractor_api_lib/impl/utils/sitemap_extractor_utils.py new file mode 100644 index 0000000..712976b --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/impl/utils/sitemap_extractor_utils.py @@ -0,0 +1,38 @@ +from bs4 import BeautifulSoup +from typing import Any, Union + + +def custom_sitemap_parser_function(content: Union[str, BeautifulSoup]) -> str: + """ + Given HTML content (as a string or BeautifulSoup object), return only the + concatenated text from all
elements. + """ + if isinstance(content, str): + soup = BeautifulSoup(content, "html.parser") + else: + soup = content + + article_elements = soup.find_all("article") + if not article_elements: + return str(content.get_text()) + + texts = [element.get_text(separator=" ", strip=True) for element in article_elements] + return "\n".join(texts) + +def custom_sitemap_meta_function(meta: dict, _content: Any) -> dict: + """ + Given metadata and HTML content, extract the title from the first

element + """ + if isinstance(_content, str): + soup = BeautifulSoup(_content, "html.parser") + else: + soup = _content + + article_elements = soup.find_all("article") + if not article_elements: + return {"source": meta["loc"], **meta} + + # Find h1 elements within the first article element + h1_elements = article_elements[0].find_all("h1") + meta["title"] = h1_elements[0].get_text(strip=True) if h1_elements else "Unknown Title" + return {"source": meta["loc"], **meta} diff --git a/extractor-api-lib/src/extractor_api_lib/mapper/__init__.py b/extractor-api-lib/src/extractor_api_lib/mapper/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/extractor-api-lib/src/extractor_api_lib/mapper/source_langchain_document2information_piece.py b/extractor-api-lib/src/extractor_api_lib/mapper/source_langchain_document2information_piece.py new file mode 100644 index 0000000..e850581 --- /dev/null +++ b/extractor-api-lib/src/extractor_api_lib/mapper/source_langchain_document2information_piece.py @@ -0,0 +1,63 @@ +"""Module for the ConfluenceLangchainDocument2InformationPiece class.""" + +from abc import abstractmethod, ABC +from langchain_core.documents import Document as LangchainDocument + +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece +from extractor_api_lib.models.content_type import ContentType + + +class SourceLangchainDocument2InformationPiece(ABC): + """ + A class to map a LangchainDocument to an InformationPiece with Confluence-specific metadata. + + Attributes + ---------- + USE_CASE_DOCUMENT_URL_KEY : str + Key for the document URL in the use case. + CONFLUENCE_LOADER_SOURCE_URL_KEY : str + Key for the source URL in the Confluence loader. + CONFLUENCE_LOADER_TITLE_KEY : str + Key for the title in the Confluence loader. + USER_CASE_PAGE_KEY : str + Key for the page in the use case. + USE_CASE_RELATED_KEY : str + Key for related information in the use case. + DOCUMENT_KEY : str + Key for the document. 
+ """ + + USE_CASE_DOCUMENT_URL_KEY = "document_url" + SOURCE_LOADER_SOURCE_URL_KEY = "source" + SOURCE_LOADER_TITLE_KEY = "title" + USER_CASE_PAGE_KEY = "page" + USE_CASE_RELATED_KEY = "related" + DOCUMENT_KEY = "document" + + def map_document2informationpiece( + self, document: LangchainDocument, document_name: str + ) -> InternalInformationPiece: + """ + Map a LangchainDocument to an InformationPiece. + + Parameters + ---------- + document : LangchainDocument + The document to be mapped. + + Returns + ------- + InformationPiece + The mapped information piece containing page content, type, and metadata. + + Raises + ------ + ValueError + If Confluence parameters are not set before mapping documents. + """ + meta = self._map_meta(document.metadata, document_name) + return InternalInformationPiece(page_content=document.page_content, type=ContentType.TEXT, metadata=meta) + + @abstractmethod + def _map_meta(self, internal: dict, document_name: str) -> dict: + raise NotImplementedError("Subclasses must implement this method.") diff --git a/extractor-api-lib/tests/sitemap_extractor_test.py b/extractor-api-lib/tests/sitemap_extractor_test.py index 1782358..04921be 100644 --- a/extractor-api-lib/tests/sitemap_extractor_test.py +++ b/extractor-api-lib/tests/sitemap_extractor_test.py @@ -54,7 +54,7 @@ def sample_extraction_parameters(self): def test_init(self, mock_mapper): """Test SitemapExtractor initialization.""" extractor = SitemapExtractor(mapper=mock_mapper) - assert extractor.mapper == mock_mapper + assert extractor._mapper == mock_mapper def test_extractor_type(self, sitemap_extractor): """Test that extractor_type returns SITEMAP.""" From 4674a67da884ffc8a6c077f4f231e7df9eff3fc6 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Thu, 5 Jun 2025 11:01:56 +0200 Subject: [PATCH 50/56] feat: enhance SitemapExtractor with improved parameter handling and custom parsing functions --- README.md | 20 ++- extractor-api-lib/pyproject.toml | 2 +- 
.../extractor_api_lib/dependency_container.py | 7 +- .../impl/extractors/sitemap_extractor.py | 51 ++++--- ...ce_langchain_document2information_piece.py | 9 +- .../sitemap_document2information_piece.py | 6 +- .../impl/utils/sitemap_extractor_utils.py | 1 + .../tests/sitemap_extractor_test.py | 132 +++++++++--------- 8 files changed, 130 insertions(+), 98 deletions(-) diff --git a/README.md b/README.md index 38a9349..d9a9e38 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # RAG Core library This repository contains the core of the STACKIT RAG template. +It provides comprehensive document extraction capabilities including support for files (PDF, DOCX, XML), web sources via sitemaps, and Confluence pages. It consists of the following python packages: - [`1. Rag Core API`](#1-rag-core-api) @@ -143,7 +144,7 @@ The extracted information will be summarized using a LLM. The summary, as well a #### `/upload_source` Loads all the content from an arbitrary non-file source using the [document-extractor](#3-extractor-api-lib). -The `type`of the source needs to correspond to an extractor in the [document-extractor](#3-extractor-api-lib). +The `type` of the source needs to correspond to an extractor in the [document-extractor](#3-extractor-api-lib). Supported types include `confluence` for Confluence pages and `sitemap` for web content via XML sitemaps. The extracted information will be summarized using LLM. The summary, as well as the unrefined extracted document, will be uploaded to the [rag-core-api](#1-rag-core-api). An is configured. Defaults to 3600 seconds (1 hour). Can be adjusted by values in the helm chart. ### 2.3 Replaceable parts @@ -169,8 +170,7 @@ The extracted information will be summarized using LLM. The summary, as well as ## 3. Extractor API Lib -The Extractor Library contains components that provide document parsing capabilities for various file formats. 
It also includes a default `dependency_container`, that is pre-configured and is a good starting point for most use-cases.
-This API should not be exposed by ingress and only used for internally.
+The Extractor Library contains components that provide document parsing capabilities for various file formats and web sources. It supports extracting content from PDF, DOCX, XML files, as well as web pages via sitemaps and Confluence pages. It also includes a default `dependency_container`, that is pre-configured and is a good starting point for most use-cases. This API should not be exposed by ingress and only used internally.
 
 The following endpoints are provided by the *extractor-api-lib*:
 
@@ -206,12 +206,21 @@ The following types of information will be extracted:
 
 #### `/extract_from_source`
 
 This endpoint will extract data for non-file source.
-The type of information that is extracted will vary depending on the source, the following types of information can be extracted:
+The type of information that is extracted will vary depending on the source. Supported sources include `confluence` for Confluence pages and `sitemap` for web pages via XML sitemaps.
+The following types of information can be extracted:
 
 - `TEXT`: plain text
 - `TABLE`: data in tabular form found in the document
 - `IMAGE`: image found in the document
 
+For sitemap sources, additional parameters can be provided, e.g.:
+- `web_path`: The URL of the XML sitemap to crawl
+- `filter_urls`: JSON array of URL patterns to filter pages (optional)
+- `header_template`: JSON object for custom HTTP headers (optional)
+
+Technically, all parameters of the `SitemapLoader` from LangChain can be provided.
+ + ### 3.3 Replaceable parts | Name | Type | Default | Notes | @@ -226,6 +235,9 @@ The type of information that is extracted will vary depending on the source, the | file_extractor | [`extractor_api_lib.api_endpoints.file_extractor.FileExtractor`](./extractor-api-lib/src/extractor_api_lib/api_endpoints/file_extractor.py) | [`extractor_api_lib.impl.api_endpoints.default_file_extractor.DefaultFileExtractor`](./extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/default_file_extractor.py) | Implementation of the `/extract_from_file` endpoint. Uses *general_extractor*. | | general_source_extractor | [`extractor_api_lib.api_endpoints.source_extractor.SourceExtractor`](./extractor-api-lib/src/extractor_api_lib/api_endpoints/source_extractor.py) | [`extractor_api_lib.impl.api_endpoints.general_source_extractor.GeneralSourceExtractor`](./extractor-api-lib/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py) | Implementation of the `/extract_from_source` endpoint. Will decide the correct extractor for the source. | | confluence_extractor | [`extractor_api_lib.extractors.information_extractor.InformationExtractor`](./extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py) | [`extractor_api_lib.impl.extractors.confluence_extractor.ConfluenceExtractor`](./extractor-api-lib/src/extractor_api_lib/extractors/confluence_extractor.py) | Implementation of an esxtractor for the source `confluence`. | +| sitemap_extractor | [`extractor_api_lib.extractors.information_extractor.InformationExtractor`](./extractor-api-lib/src/extractor_api_lib/extractors/information_extractor.py) | [`extractor_api_lib.impl.extractors.sitemap_extractor.SitemapExtractor`](./extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py) | Implementation of an extractor for the source `sitemap`. Supports XML sitemap crawling with configurable parameters including URL filtering, custom headers, and crawling depth. 
Uses LangChain's SitemapLoader with support for custom parsing and meta functions via dependency injection. | +| sitemap_parsing_function | `dependency_injector.providers.Factory[Callable]` | [`extractor_api_lib.impl.utils.sitemap_extractor_utils.custom_sitemap_parser_function`](./extractor-api-lib/src/extractor_api_lib/impl/utils/sitemap_extractor_utils.py) | Custom parsing function for sitemap content extraction. Used by the sitemap extractor to parse HTML content from web pages. Can be replaced to customize how web page content is processed and extracted. | +| sitemap_meta_function | `dependency_injector.providers.Factory[Callable]` | [`extractor_api_lib.impl.utils.sitemap_extractor_utils.custom_sitemap_meta_function`](./extractor-api-lib/src/extractor_api_lib/impl/utils/sitemap_extractor_utils.py) | Custom meta function for sitemap content processing. Used by the sitemap extractor to extract metadata from web pages. Can be replaced to customize how metadata is extracted and structured from web content. | ## 4. 
RAG Core Lib diff --git a/extractor-api-lib/pyproject.toml b/extractor-api-lib/pyproject.toml index 814e68a..406f42b 100644 --- a/extractor-api-lib/pyproject.toml +++ b/extractor-api-lib/pyproject.toml @@ -28,7 +28,7 @@ per-file-ignores = """ ./src/extractor_api_lib/impl/extractor_api_impl.py: B008, ./src/extractor_api_lib/container.py: CCE002,CCE001, ./src/extractor_api_lib/apis/extractor_api_base.py: WOT001, - ./tests/*: S101, + ./tests/*: S101,E501, """ [tool.black] diff --git a/extractor-api-lib/src/extractor_api_lib/dependency_container.py b/extractor-api-lib/src/extractor_api_lib/dependency_container.py index 6a83a42..628991a 100644 --- a/extractor-api-lib/src/extractor_api_lib/dependency_container.py +++ b/extractor-api-lib/src/extractor_api_lib/dependency_container.py @@ -21,7 +21,10 @@ from extractor_api_lib.impl.settings.pdf_extractor_settings import PDFExtractorSettings from extractor_api_lib.impl.settings.s3_settings import S3Settings from extractor_api_lib.impl.table_converter.dataframe2markdown import DataFrame2Markdown -from extractor_api_lib.impl.utils.sitemap_extractor_utils import custom_sitemap_meta_function, custom_sitemap_parser_function +from extractor_api_lib.impl.utils.sitemap_extractor_utils import ( + custom_sitemap_meta_function, + custom_sitemap_parser_function, +) class DependencyContainer(DeclarativeContainer): @@ -52,7 +55,7 @@ class DependencyContainer(DeclarativeContainer): SitemapExtractor, mapper=sitemap_document2information_piece, parsing_function=sitemap_parsing_function, - meta_function=sitemap_meta_function + meta_function=sitemap_meta_function, ) source_extractor = Singleton( GeneralSourceExtractor, diff --git a/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py b/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py index 8448740..46dd292 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py +++ 
b/extractor-api-lib/src/extractor_api_lib/impl/extractors/sitemap_extractor.py @@ -62,25 +62,7 @@ async def aextract_content( list[InternalInformationPiece] A list of information pieces extracted from Sitemap. """ - sitemap_loader_parameters = {} - headers = None - - for x in extraction_parameters.kwargs: - if x.key == "header_template": - try: - headers = json.loads(x.value) - except (json.JSONDecodeError, TypeError): - headers = x.value if isinstance(x.value, dict) else None - elif x.key == "filter_urls": - try: - sitemap_loader_parameters[x.key] = json.loads(x.value) - except (json.JSONDecodeError, TypeError): - sitemap_loader_parameters[x.key] = x.value - else: - sitemap_loader_parameters[x.key] = int(x.value) if x.value.isdigit() else x.value - - if headers: - sitemap_loader_parameters["header_template"] = headers + sitemap_loader_parameters = self._parse_sitemap_loader_parameters(extraction_parameters) if "document_name" in sitemap_loader_parameters: sitemap_loader_parameters.pop("document_name", None) @@ -96,6 +78,7 @@ async def aextract_content( document_loader = SitemapLoader(**sitemap_loader_parameters) documents = [] try: + def load_documents(): return list(document_loader.lazy_load()) @@ -103,3 +86,33 @@ def load_documents(): except Exception as e: raise ValueError(f"Failed to load documents from Sitemap: {e}") return [self._mapper.map_document2informationpiece(x, extraction_parameters.document_name) for x in documents] + + def _parse_sitemap_loader_parameters(self, extraction_parameters: ExtractionParameters) -> dict: + """ + Parse the extraction parameters to extract sitemap loader parameters. + + Parameters + ---------- + extraction_parameters : ExtractionParameters + The parameters required to connect to and extract data from Sitemap. + + Returns + ------- + dict + A dictionary containing the parsed sitemap loader parameters. 
+ """ + sitemap_loader_parameters = {} + for x in extraction_parameters.kwargs: + if x.key == "header_template": + try: + sitemap_loader_parameters[x.key] = json.loads(x.value) + except (json.JSONDecodeError, TypeError): + sitemap_loader_parameters[x.key] = x.value if isinstance(x.value, dict) else None + elif x.key == "filter_urls": + try: + sitemap_loader_parameters[x.key] = json.loads(x.value) + except (json.JSONDecodeError, TypeError): + sitemap_loader_parameters[x.key] = x.value + else: + sitemap_loader_parameters[x.key] = int(x.value) if x.value.isdigit() else x.value + return sitemap_loader_parameters diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py index 0957687..13a01a7 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py @@ -1,10 +1,8 @@ """Module for the ConfluenceLangchainDocument2InformationPiece class.""" -from langchain_core.documents import Document as LangchainDocument - -from extractor_api_lib.mapper.source_langchain_document2information_piece import SourceLangchainDocument2InformationPiece -from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece -from extractor_api_lib.models.content_type import ContentType +from extractor_api_lib.mapper.source_langchain_document2information_piece import ( + SourceLangchainDocument2InformationPiece, +) class ConfluenceLangchainDocument2InformationPiece(SourceLangchainDocument2InformationPiece): @@ -27,7 +25,6 @@ class ConfluenceLangchainDocument2InformationPiece(SourceLangchainDocument2Infor Key for the document. 
""" - def _map_meta(self, internal: dict, document_name: str) -> dict: metadata = {} for key, value in internal.items(): diff --git a/extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py b/extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py index c239ce7..815b3fa 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py @@ -1,7 +1,9 @@ """Module for the SitemapLangchainDocument2InformationPiece class.""" -from extractor_api_lib.impl.utils.utils import hash_datetime -from extractor_api_lib.mapper.source_langchain_document2information_piece import SourceLangchainDocument2InformationPiece +from extractor_api_lib.impl.utils.utils import hash_datetime +from extractor_api_lib.mapper.source_langchain_document2information_piece import ( + SourceLangchainDocument2InformationPiece, +) class SitemapLangchainDocument2InformationPiece(SourceLangchainDocument2InformationPiece): diff --git a/extractor-api-lib/src/extractor_api_lib/impl/utils/sitemap_extractor_utils.py b/extractor-api-lib/src/extractor_api_lib/impl/utils/sitemap_extractor_utils.py index 712976b..041569f 100644 --- a/extractor-api-lib/src/extractor_api_lib/impl/utils/sitemap_extractor_utils.py +++ b/extractor-api-lib/src/extractor_api_lib/impl/utils/sitemap_extractor_utils.py @@ -19,6 +19,7 @@ def custom_sitemap_parser_function(content: Union[str, BeautifulSoup]) -> str: texts = [element.get_text(separator=" ", strip=True) for element in article_elements] return "\n".join(texts) + def custom_sitemap_meta_function(meta: dict, _content: Any) -> dict: """ Given metadata and HTML content, extract the title from the first

element diff --git a/extractor-api-lib/tests/sitemap_extractor_test.py b/extractor-api-lib/tests/sitemap_extractor_test.py index 04921be..ff31e0f 100644 --- a/extractor-api-lib/tests/sitemap_extractor_test.py +++ b/extractor-api-lib/tests/sitemap_extractor_test.py @@ -1,9 +1,8 @@ """Comprehensive test suite for SitemapExtractor class.""" import asyncio -import json import pytest -from unittest.mock import AsyncMock, MagicMock, patch, Mock +from unittest.mock import MagicMock, patch from langchain_core.documents import Document as LangchainDocument from extractor_api_lib.impl.extractors.sitemap_extractor import SitemapExtractor @@ -27,7 +26,7 @@ def mock_mapper(self): mapper.map_document2informationpiece.return_value = InternalInformationPiece( type=ContentType.TEXT, metadata={"document": "test_doc", "id": "test_id", "related": []}, - page_content="Test content" + page_content="Test content", ) return mapper @@ -47,8 +46,8 @@ def sample_extraction_parameters(self): KeyValuePair(key="filter_urls", value='["https://example.com/page1", "https://example.com/page2"]'), KeyValuePair(key="header_template", value='{"User-Agent": "test-agent"}'), KeyValuePair(key="max_depth", value="2"), - KeyValuePair(key="blocksize", value="10") - ] + KeyValuePair(key="blocksize", value="10"), + ], ) def test_init(self, mock_mapper): @@ -61,8 +60,10 @@ def test_extractor_type(self, sitemap_extractor): assert sitemap_extractor.extractor_type == ExtractorTypes.SITEMAP @pytest.mark.asyncio - @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') - async def test_aextract_content_basic(self, mock_sitemap_loader_class, sitemap_extractor, sample_extraction_parameters): + @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader") + async def test_aextract_content_basic( + self, mock_sitemap_loader_class, sitemap_extractor, sample_extraction_parameters + ): """Test basic content extraction functionality.""" # Setup mock SitemapLoader mock_loader_instance = 
MagicMock() @@ -71,13 +72,11 @@ async def test_aextract_content_basic(self, mock_sitemap_loader_class, sitemap_e # Create mock documents mock_documents = [ LangchainDocument( - page_content="Content from page 1", - metadata={"source": "https://example.com/page1", "title": "Page 1"} + page_content="Content from page 1", metadata={"source": "https://example.com/page1", "title": "Page 1"} ), LangchainDocument( - page_content="Content from page 2", - metadata={"source": "https://example.com/page2", "title": "Page 2"} - ) + page_content="Content from page 2", metadata={"source": "https://example.com/page2", "title": "Page 2"} + ), ] mock_loader_instance.lazy_load.return_value = iter(mock_documents) @@ -87,13 +86,13 @@ async def test_aextract_content_basic(self, mock_sitemap_loader_class, sitemap_e InternalInformationPiece( type=ContentType.TEXT, metadata={"document": "test_sitemap_doc", "id": "id1", "related": []}, - page_content="Content from page 1" + page_content="Content from page 1", ), InternalInformationPiece( type=ContentType.TEXT, metadata={"document": "test_sitemap_doc", "id": "id2", "related": []}, - page_content="Content from page 2" - ) + page_content="Content from page 2", + ), ] sitemap_extractor.mapper.map_document2informationpiece.side_effect = expected_info_pieces @@ -119,7 +118,7 @@ async def test_aextract_content_basic(self, mock_sitemap_loader_class, sitemap_e assert sitemap_extractor.mapper.map_document2informationpiece.call_count == 2 @pytest.mark.asyncio - @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader") async def test_aextract_content_json_parsing_failure(self, mock_sitemap_loader_class, sitemap_extractor): """Test extraction with invalid JSON in parameters falls back to string values.""" # Create parameters with invalid JSON @@ -129,8 +128,8 @@ async def test_aextract_content_json_parsing_failure(self, mock_sitemap_loader_c kwargs=[ 
KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"), KeyValuePair(key="filter_urls", value="invalid-json["), - KeyValuePair(key="header_template", value="invalid-json{") - ] + KeyValuePair(key="header_template", value="invalid-json{"), + ], ) # Setup mock @@ -150,7 +149,7 @@ async def test_aextract_content_json_parsing_failure(self, mock_sitemap_loader_c assert "header_template" not in call_args # Should not be set due to invalid JSON @pytest.mark.asyncio - @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader") async def test_aextract_content_header_template_dict_value(self, mock_sitemap_loader_class, sitemap_extractor): """Test extraction when header_template is already a dict.""" extraction_params = ExtractionParameters( @@ -158,8 +157,8 @@ async def test_aextract_content_header_template_dict_value(self, mock_sitemap_lo source_type="sitemap", kwargs=[ KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"), - KeyValuePair(key="header_template", value={"User-Agent": "direct-dict"}) - ] + KeyValuePair(key="header_template", value={"User-Agent": "direct-dict"}), + ], ) # Setup mock @@ -168,14 +167,14 @@ async def test_aextract_content_header_template_dict_value(self, mock_sitemap_lo mock_loader_instance.lazy_load.return_value = iter([]) # Execute - result = await sitemap_extractor.aextract_content(extraction_params) + _ = await sitemap_extractor.aextract_content(extraction_params) # Verify call_args = mock_sitemap_loader_class.call_args[1] assert call_args["header_template"] == {"User-Agent": "direct-dict"} @pytest.mark.asyncio - @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader") async def test_aextract_content_document_name_removed(self, mock_sitemap_loader_class, sitemap_extractor): """Test that document_name parameter is removed from 
SitemapLoader parameters.""" extraction_params = ExtractionParameters( @@ -183,8 +182,8 @@ async def test_aextract_content_document_name_removed(self, mock_sitemap_loader_ source_type="sitemap", kwargs=[ KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"), - KeyValuePair(key="document_name", value="should_be_removed") - ] + KeyValuePair(key="document_name", value="should_be_removed"), + ], ) # Setup mock @@ -200,7 +199,7 @@ async def test_aextract_content_document_name_removed(self, mock_sitemap_loader_ assert "document_name" not in call_args @pytest.mark.asyncio - @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader") async def test_aextract_content_numeric_parameters(self, mock_sitemap_loader_class, sitemap_extractor): """Test extraction with numeric string parameters.""" extraction_params = ExtractionParameters( @@ -211,8 +210,8 @@ async def test_aextract_content_numeric_parameters(self, mock_sitemap_loader_cla KeyValuePair(key="max_depth", value="5"), KeyValuePair(key="blocksize", value="20"), KeyValuePair(key="blocknum", value="1"), - KeyValuePair(key="non_numeric", value="not_a_number") - ] + KeyValuePair(key="non_numeric", value="not_a_number"), + ], ) # Setup mock @@ -231,8 +230,10 @@ async def test_aextract_content_numeric_parameters(self, mock_sitemap_loader_cla assert call_args["non_numeric"] == "not_a_number" @pytest.mark.asyncio - @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') - async def test_aextract_content_loader_exception(self, mock_sitemap_loader_class, sitemap_extractor, sample_extraction_parameters): + @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader") + async def test_aextract_content_loader_exception( + self, mock_sitemap_loader_class, sitemap_extractor, sample_extraction_parameters + ): """Test handling of SitemapLoader exceptions.""" # Setup mock to raise exception 
mock_loader_instance = MagicMock() @@ -244,8 +245,10 @@ async def test_aextract_content_loader_exception(self, mock_sitemap_loader_class await sitemap_extractor.aextract_content(sample_extraction_parameters) @pytest.mark.asyncio - @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') - async def test_aextract_content_empty_documents(self, mock_sitemap_loader_class, sitemap_extractor, sample_extraction_parameters): + @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader") + async def test_aextract_content_empty_documents( + self, mock_sitemap_loader_class, sitemap_extractor, sample_extraction_parameters + ): """Test extraction when SitemapLoader returns no documents.""" # Setup mock to return empty list mock_loader_instance = MagicMock() @@ -260,15 +263,13 @@ async def test_aextract_content_empty_documents(self, mock_sitemap_loader_class, sitemap_extractor.mapper.map_document2informationpiece.assert_not_called() @pytest.mark.asyncio - @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader") async def test_aextract_content_minimal_parameters(self, mock_sitemap_loader_class, sitemap_extractor): """Test extraction with minimal required parameters.""" extraction_params = ExtractionParameters( document_name="minimal_doc", source_type="sitemap", - kwargs=[ - KeyValuePair(key="web_path", value="https://example.com/sitemap.xml") - ] + kwargs=[KeyValuePair(key="web_path", value="https://example.com/sitemap.xml")], ) # Setup mock @@ -285,7 +286,7 @@ async def test_aextract_content_minimal_parameters(self, mock_sitemap_loader_cla mock_sitemap_loader_class.assert_called_once_with(web_path="https://example.com/sitemap.xml") @pytest.mark.asyncio - @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader") async def 
test_aextract_content_complex_filter_urls(self, mock_sitemap_loader_class, sitemap_extractor): """Test extraction with complex filter_urls JSON array.""" extraction_params = ExtractionParameters( @@ -293,8 +294,10 @@ async def test_aextract_content_complex_filter_urls(self, mock_sitemap_loader_cl source_type="sitemap", kwargs=[ KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"), - KeyValuePair(key="filter_urls", value='[".*\\\\.html$", ".*page[0-9]+.*", "https://example\\\\.com/special/.*"]') - ] + KeyValuePair( + key="filter_urls", value='[".*\\\\.html$", ".*page[0-9]+.*", "https://example\\\\.com/special/.*"]' + ), + ], ) # Setup mock @@ -311,7 +314,7 @@ async def test_aextract_content_complex_filter_urls(self, mock_sitemap_loader_cl assert call_args["filter_urls"] == expected_patterns @pytest.mark.asyncio - @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader") async def test_aextract_content_no_headers(self, mock_sitemap_loader_class, sitemap_extractor): """Test extraction without header_template parameter.""" extraction_params = ExtractionParameters( @@ -319,8 +322,8 @@ async def test_aextract_content_no_headers(self, mock_sitemap_loader_class, site source_type="sitemap", kwargs=[ KeyValuePair(key="web_path", value="https://example.com/sitemap.xml"), - KeyValuePair(key="max_depth", value="3") - ] + KeyValuePair(key="max_depth", value="3"), + ], ) # Setup mock @@ -337,27 +340,27 @@ async def test_aextract_content_no_headers(self, mock_sitemap_loader_class, site assert call_args["max_depth"] == 3 @pytest.mark.asyncio - @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader") async def test_aextract_content_with_real_langchain_documents(self, mock_sitemap_loader_class, sitemap_extractor): """Test extraction with realistic LangChain Document objects.""" 
extraction_params = ExtractionParameters( document_name="realistic_doc", source_type="sitemap", - kwargs=[KeyValuePair(key="web_path", value="https://example.com/sitemap.xml")] + kwargs=[KeyValuePair(key="web_path", value="https://example.com/sitemap.xml")], ) # Create realistic documents mock_documents = [ LangchainDocument( - page_content="

Welcome to Example

This is the homepage content with useful information about our services.

", + page_content="""

Welcome to Example

This is the homepage content with useful information about our services.

""", metadata={ "source": "https://example.com/", "title": "Example Homepage", "loc": "https://example.com/", "lastmod": "2023-12-01", "changefreq": "weekly", - "priority": "1.0" - } + "priority": "1.0", + }, ), LangchainDocument( page_content="

About Us

Learn more about our company history and mission.

", @@ -365,9 +368,9 @@ async def test_aextract_content_with_real_langchain_documents(self, mock_sitemap "source": "https://example.com/about", "title": "About Us - Example", "loc": "https://example.com/about", - "lastmod": "2023-11-15" - } - ) + "lastmod": "2023-11-15", + }, + ), ] # Setup mock @@ -389,9 +392,11 @@ async def test_aextract_content_with_real_langchain_documents(self, mock_sitemap assert args[1] == "realistic_doc" @pytest.mark.asyncio - @patch('extractor_api_lib.impl.extractors.sitemap_extractor.asyncio.get_event_loop') - @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') - async def test_aextract_content_executor_usage(self, mock_sitemap_loader_class, mock_get_event_loop, sitemap_extractor, sample_extraction_parameters): + @patch("extractor_api_lib.impl.extractors.sitemap_extractor.asyncio.get_event_loop") + @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader") + async def test_aextract_content_executor_usage( + self, mock_sitemap_loader_class, mock_get_event_loop, sitemap_extractor, sample_extraction_parameters + ): """Test that content extraction uses executor for non-async sitemap loading.""" # Setup mocks mock_loop = MagicMock() @@ -407,7 +412,7 @@ async def test_aextract_content_executor_usage(self, mock_sitemap_loader_class, mock_loop.run_in_executor.return_value = future # Execute - result = await sitemap_extractor.aextract_content(sample_extraction_parameters) + _ = await sitemap_extractor.aextract_content(sample_extraction_parameters) # Verify executor was used mock_loop.run_in_executor.assert_called_once() @@ -418,17 +423,14 @@ async def test_aextract_content_executor_usage(self, mock_sitemap_loader_class, def test_extractor_inheritance(self, sitemap_extractor): """Test that SitemapExtractor properly inherits from InformationExtractor.""" from extractor_api_lib.extractors.information_extractor import InformationExtractor + assert isinstance(sitemap_extractor, InformationExtractor) 
@pytest.mark.asyncio - @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader") async def test_aextract_content_edge_case_empty_kwargs(self, mock_sitemap_loader_class, sitemap_extractor): """Test extraction with empty kwargs list.""" - extraction_params = ExtractionParameters( - document_name="empty_kwargs_doc", - source_type="sitemap", - kwargs=[] - ) + extraction_params = ExtractionParameters(document_name="empty_kwargs_doc", source_type="sitemap", kwargs=[]) # Setup mock mock_loader_instance = MagicMock() @@ -444,7 +446,7 @@ async def test_aextract_content_edge_case_empty_kwargs(self, mock_sitemap_loader mock_sitemap_loader_class.assert_called_once_with() @pytest.mark.asyncio - @patch('extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader') + @patch("extractor_api_lib.impl.extractors.sitemap_extractor.SitemapLoader") async def test_aextract_content_mixed_parameter_types(self, mock_sitemap_loader_class, sitemap_extractor): """Test extraction with mixed parameter types (strings, numbers, JSON).""" extraction_params = ExtractionParameters( @@ -455,9 +457,11 @@ async def test_aextract_content_mixed_parameter_types(self, mock_sitemap_loader_ KeyValuePair(key="max_depth", value="3"), # Will be converted to int KeyValuePair(key="continue_on_failure", value="true"), # Will remain string KeyValuePair(key="filter_urls", value='["pattern1", "pattern2"]'), # Will be parsed as JSON - KeyValuePair(key="header_template", value='{"Authorization": "Bearer token123"}'), # Will be parsed as JSON - KeyValuePair(key="custom_param", value="custom_value") # Will remain string - ] + KeyValuePair( + key="header_template", value='{"Authorization": "Bearer token123"}' + ), # Will be parsed as JSON + KeyValuePair(key="custom_param", value="custom_value"), # Will remain string + ], ) # Setup mock From 0c079868f0c562aad03b8e6be50b6861881950ea Mon Sep 17 00:00:00 2001 From: Andreas 
Klos Date: Thu, 5 Jun 2025 11:08:54 +0200 Subject: [PATCH 51/56] feat: add settings parameter to DefaultSourceUploader tests for improved configurability --- .../tests/default_source_uploader_test.py | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/admin-api-lib/tests/default_source_uploader_test.py b/admin-api-lib/tests/default_source_uploader_test.py index 9c47416..fdc8532 100644 --- a/admin-api-lib/tests/default_source_uploader_test.py +++ b/admin-api-lib/tests/default_source_uploader_test.py @@ -23,12 +23,13 @@ def mocks(): document_deleter.adelete_document = AsyncMock() rag_api = MagicMock() information_mapper = MagicMock() - return extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + settings = MagicMock() + return extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings @pytest.mark.asyncio async def test_handle_source_upload_success(mocks): - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings = mocks # Setup mocks dummy_piece = MagicMock() extractor_api.extract_from_source.return_value = [dummy_piece] @@ -47,6 +48,7 @@ async def test_handle_source_upload_success(mocks): document_deleter, rag_api, information_mapper, + settings=settings ) await uploader._handle_source_upload("source1", "type1", []) @@ -58,7 +60,7 @@ async def test_handle_source_upload_success(mocks): @pytest.mark.asyncio async def test_handle_source_upload_no_info_pieces(mocks): - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings = mocks 
extractor_api.extract_from_source.return_value = [] uploader = DefaultSourceUploader( @@ -69,6 +71,7 @@ async def test_handle_source_upload_no_info_pieces(mocks): document_deleter, rag_api, information_mapper, + settings=settings, ) await uploader._handle_source_upload("source2", "type2", []) @@ -79,13 +82,13 @@ async def test_handle_source_upload_no_info_pieces(mocks): @pytest.mark.asyncio async def test_upload_source_already_processing_raises_error(mocks): - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings = mocks source_type = "typeX" name = "Doc Name" source_name = f"{source_type}:{sanitize_document_name(name)}" key_value_store.get_all.return_value = [(source_name, Status.PROCESSING)] uploader = DefaultSourceUploader( - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings ) with pytest.raises(HTTPException): # use default timeout @@ -95,7 +98,7 @@ async def test_upload_source_already_processing_raises_error(mocks): @pytest.mark.asyncio async def test_upload_source_no_timeout(mocks, monkeypatch): - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings = mocks key_value_store.get_all.return_value = [] source_type = "typeZ" name = "quick" @@ -103,10 +106,11 @@ async def test_upload_source_no_timeout(mocks, monkeypatch): dummy_thread = MagicMock() monkeypatch.setattr(default_source_uploader, "Thread", lambda *args, **kwargs: dummy_thread) uploader = DefaultSourceUploader( - extractor_api, key_value_store, 
information_enhancer, chunker, document_deleter, rag_api, information_mapper + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings ) # should not raise - await uploader.upload_source(source_type, name, [], timeout=1.0) + settings.timeout = 1.0 + await uploader.upload_source(source_type, name, []) # only PROCESSING status upserted, no ERROR assert any(call.args[1] == Status.PROCESSING for call in key_value_store.upsert.call_args_list) assert not any(call.args[1] == Status.ERROR for call in key_value_store.upsert.call_args_list) @@ -115,7 +119,7 @@ async def test_upload_source_no_timeout(mocks, monkeypatch): @pytest.mark.asyncio async def test_upload_source_timeout_error(mocks, monkeypatch): - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper = mocks + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings = mocks key_value_store.get_all.return_value = [] source_type = "typeTimeout" name = "slow" @@ -141,11 +145,11 @@ def is_alive(self): monkeypatch.setattr(default_source_uploader, "Thread", FakeThread) uploader = DefaultSourceUploader( - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper + extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings ) # no exception should be raised; timeout path sets ERROR status - - await uploader.upload_source(source_type, name, [], timeout=1.0) + settings.timeout = 1.0 + await uploader.upload_source(source_type, name, []) # first call marks PROCESSING, second marks ERROR calls = [call.args for call in key_value_store.upsert.call_args_list] assert (source_name, Status.PROCESSING) in calls From dad06875d21c9981bb63062f381e02b96d444a2b Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Thu, 5 Jun 2025 11:13:28 +0200 Subject: [PATCH 
52/56] refactor: improve readability of mocks setup in DefaultSourceUploader tests --- .../tests/default_source_uploader_test.py | 95 +++++++++++++++++-- 1 file changed, 85 insertions(+), 10 deletions(-) diff --git a/admin-api-lib/tests/default_source_uploader_test.py b/admin-api-lib/tests/default_source_uploader_test.py index fdc8532..9146596 100644 --- a/admin-api-lib/tests/default_source_uploader_test.py +++ b/admin-api-lib/tests/default_source_uploader_test.py @@ -24,12 +24,30 @@ def mocks(): rag_api = MagicMock() information_mapper = MagicMock() settings = MagicMock() - return extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings + return ( + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + settings, + ) @pytest.mark.asyncio async def test_handle_source_upload_success(mocks): - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings = mocks + ( + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + settings, + ) = mocks # Setup mocks dummy_piece = MagicMock() extractor_api.extract_from_source.return_value = [dummy_piece] @@ -48,7 +66,7 @@ async def test_handle_source_upload_success(mocks): document_deleter, rag_api, information_mapper, - settings=settings + settings=settings, ) await uploader._handle_source_upload("source1", "type1", []) @@ -60,7 +78,16 @@ async def test_handle_source_upload_success(mocks): @pytest.mark.asyncio async def test_handle_source_upload_no_info_pieces(mocks): - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings = mocks + ( + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + settings, + ) = mocks 
extractor_api.extract_from_source.return_value = [] uploader = DefaultSourceUploader( @@ -82,13 +109,29 @@ async def test_handle_source_upload_no_info_pieces(mocks): @pytest.mark.asyncio async def test_upload_source_already_processing_raises_error(mocks): - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings = mocks + ( + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + settings, + ) = mocks source_type = "typeX" name = "Doc Name" source_name = f"{source_type}:{sanitize_document_name(name)}" key_value_store.get_all.return_value = [(source_name, Status.PROCESSING)] uploader = DefaultSourceUploader( - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + settings, ) with pytest.raises(HTTPException): # use default timeout @@ -98,7 +141,16 @@ async def test_upload_source_already_processing_raises_error(mocks): @pytest.mark.asyncio async def test_upload_source_no_timeout(mocks, monkeypatch): - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings = mocks + ( + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + settings, + ) = mocks key_value_store.get_all.return_value = [] source_type = "typeZ" name = "quick" @@ -106,7 +158,14 @@ async def test_upload_source_no_timeout(mocks, monkeypatch): dummy_thread = MagicMock() monkeypatch.setattr(default_source_uploader, "Thread", lambda *args, **kwargs: dummy_thread) uploader = DefaultSourceUploader( - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings + extractor_api, + key_value_store, + 
information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + settings, ) # should not raise settings.timeout = 1.0 @@ -119,7 +178,16 @@ async def test_upload_source_no_timeout(mocks, monkeypatch): @pytest.mark.asyncio async def test_upload_source_timeout_error(mocks, monkeypatch): - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings = mocks + ( + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + settings, + ) = mocks key_value_store.get_all.return_value = [] source_type = "typeTimeout" name = "slow" @@ -145,7 +213,14 @@ def is_alive(self): monkeypatch.setattr(default_source_uploader, "Thread", FakeThread) uploader = DefaultSourceUploader( - extractor_api, key_value_store, information_enhancer, chunker, document_deleter, rag_api, information_mapper, settings + extractor_api, + key_value_store, + information_enhancer, + chunker, + document_deleter, + rag_api, + information_mapper, + settings, ) # no exception should be raised; timeout path sets ERROR status settings.timeout = 1.0 From c88430f899dcac4f9f94f6cdafe634eee2eaec78 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 6 Jun 2025 11:10:19 +0200 Subject: [PATCH 53/56] feat: refactor page summary creation logic for improved grouping and summarization --- .../page_summary_enhancer.py | 49 +++++++++++++------ 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py b/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py index e5fd054..b39149d 100644 --- a/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py +++ b/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py @@ -27,21 +27,41 @@ class PageSummaryEnhancer(SummaryEnhancer): BASE64_IMAGE_KEY = "base64_image" 
DEFAULT_PAGE_NR = 1 + + async def _asummarize_page(self, page_pieces: list[Document], config: Optional[RunnableConfig]) -> Document: + full_page_content = " ".join([piece.page_content for piece in page_pieces]) + summary = await self._summarizer.ainvoke(full_page_content, config) + meta = {key: value for key, value in page_pieces[0].metadata.items() if key != self.BASE64_IMAGE_KEY} + meta["id"] = sha256(str.encode(full_page_content)).hexdigest() + meta["related"] = meta["related"] + [piece.metadata["id"] for piece in page_pieces] + meta["type"] = ContentType.SUMMARY.value + + return Document(metadata=meta, page_content=summary) + + async def _acreate_summary(self, information: list[Document], config: Optional[RunnableConfig]) -> list[Document]: - # group infos by page, defaulting to page 1 if no page metadata - if self._chunker_settings: - filtered_information = [ - info for info in information if len(info.page_content) > self._chunker_settings.max_size - ] - else: - filtered_information = information - grouped = [ - [info for info in filtered_information if info.metadata.get("page", self.DEFAULT_PAGE_NR) == page] - for page in {info_piece.metadata.get("page", self.DEFAULT_PAGE_NR) for info_piece in filtered_information} - ] - - summary_tasks = [self._asummarize_page(info_group, config) for info_group in tqdm(grouped)] - return await gather(*summary_tasks) + distinct_pages = [] + for info in information: + if info.metadata.get("page", self.DEFAULT_PAGE_NR) not in distinct_pages: + distinct_pages.append(info.metadata.get("page", self.DEFAULT_PAGE_NR)) + + grouped = [] + for page in distinct_pages: + group = [] + for compare_info in information: + if compare_info.metadata.get("page", self.DEFAULT_PAGE_NR) == page: + group.append(compare_info) + if self._chunker_settings and len(" ".join([item.page_content for item in group])) < self._chunker_settings.max_size: + continue + grouped.append(group) + + summary_tasks = [self._asummarize_page_with_limit(info_group, config) 
for info_group in tqdm(grouped)] + summaries = await gather(*summary_tasks) + return information + summaries + + async def _asummarize_page_with_limit(self, page_pieces: list[Document], config: Optional[RunnableConfig]) -> Document: + async with self._semaphore: + return await self._asummarize_page(page_pieces, config) async def _asummarize_page(self, page_pieces: list[Document], config: Optional[RunnableConfig]) -> Document: full_page_content = " ".join([piece.page_content for piece in page_pieces]) @@ -52,3 +72,4 @@ async def _asummarize_page(self, page_pieces: list[Document], config: Optional[R meta["type"] = ContentType.SUMMARY.value return Document(metadata=meta, page_content=summary) + From 442dda4a3b7c0a3ab287149ed59969c2a009baac Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 6 Jun 2025 12:20:43 +0200 Subject: [PATCH 54/56] refactor: remove redundant summarization method and streamline summary task creation --- .../impl/information_enhancer/page_summary_enhancer.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py b/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py index b39149d..a40a1fe 100644 --- a/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py +++ b/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py @@ -55,14 +55,10 @@ async def _acreate_summary(self, information: list[Document], config: Optional[R continue grouped.append(group) - summary_tasks = [self._asummarize_page_with_limit(info_group, config) for info_group in tqdm(grouped)] + summary_tasks = [self._asummarize_page(info_group, config) for info_group in tqdm(grouped)] summaries = await gather(*summary_tasks) return information + summaries - async def _asummarize_page_with_limit(self, page_pieces: list[Document], config: Optional[RunnableConfig]) -> Document: - async with self._semaphore: - 
return await self._asummarize_page(page_pieces, config) - async def _asummarize_page(self, page_pieces: list[Document], config: Optional[RunnableConfig]) -> Document: full_page_content = " ".join([piece.page_content for piece in page_pieces]) summary = await self._summarizer.ainvoke(full_page_content, config) @@ -72,4 +68,3 @@ async def _asummarize_page(self, page_pieces: list[Document], config: Optional[R meta["type"] = ContentType.SUMMARY.value return Document(metadata=meta, page_content=summary) - From cda5ff5b5019892ec6bfba1d8a632d49d9b69a68 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 6 Jun 2025 12:34:59 +0200 Subject: [PATCH 55/56] refactor: simplify summary creation logic by removing redundant variable assignment --- .../impl/information_enhancer/page_summary_enhancer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py b/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py index a40a1fe..d0aeeea 100644 --- a/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py +++ b/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py @@ -56,8 +56,8 @@ async def _acreate_summary(self, information: list[Document], config: Optional[R grouped.append(group) summary_tasks = [self._asummarize_page(info_group, config) for info_group in tqdm(grouped)] - summaries = await gather(*summary_tasks) - return information + summaries + + return await gather(*summary_tasks) async def _asummarize_page(self, page_pieces: list[Document], config: Optional[RunnableConfig]) -> Document: full_page_content = " ".join([piece.page_content for piece in page_pieces]) From 14ca90a114986ec341e1a774dd792865467a2499 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 6 Jun 2025 12:46:38 +0200 Subject: [PATCH 56/56] refactor: remove redundant whitespace and streamline summary creation logic --- 
.../page_summary_enhancer.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py b/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py index d0aeeea..6adbdd0 100644 --- a/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py +++ b/admin-api-lib/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py @@ -27,7 +27,6 @@ class PageSummaryEnhancer(SummaryEnhancer): BASE64_IMAGE_KEY = "base64_image" DEFAULT_PAGE_NR = 1 - async def _asummarize_page(self, page_pieces: list[Document], config: Optional[RunnableConfig]) -> Document: full_page_content = " ".join([piece.page_content for piece in page_pieces]) summary = await self._summarizer.ainvoke(full_page_content, config) @@ -38,7 +37,6 @@ async def _asummarize_page(self, page_pieces: list[Document], config: Optional[R return Document(metadata=meta, page_content=summary) - async def _acreate_summary(self, information: list[Document], config: Optional[RunnableConfig]) -> list[Document]: distinct_pages = [] for info in information: @@ -51,20 +49,13 @@ async def _acreate_summary(self, information: list[Document], config: Optional[R for compare_info in information: if compare_info.metadata.get("page", self.DEFAULT_PAGE_NR) == page: group.append(compare_info) - if self._chunker_settings and len(" ".join([item.page_content for item in group])) < self._chunker_settings.max_size: + if ( + self._chunker_settings + and len(" ".join([item.page_content for item in group])) < self._chunker_settings.max_size + ): continue grouped.append(group) summary_tasks = [self._asummarize_page(info_group, config) for info_group in tqdm(grouped)] return await gather(*summary_tasks) - - async def _asummarize_page(self, page_pieces: list[Document], config: Optional[RunnableConfig]) -> Document: - full_page_content = " ".join([piece.page_content for piece in 
page_pieces]) - summary = await self._summarizer.ainvoke(full_page_content, config) - meta = {key: value for key, value in page_pieces[0].metadata.items() if key != self.BASE64_IMAGE_KEY} - meta["id"] = sha256(str.encode(full_page_content)).hexdigest() - meta["related"] = meta["related"] + [piece.metadata["id"] for piece in page_pieces] - meta["type"] = ContentType.SUMMARY.value - - return Document(metadata=meta, page_content=summary)