diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 28ad3531d..4592f4f08 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -1,4 +1,4 @@ -name: Docker Build and Push +name: Docker Build on: # Manual trigger only diff --git a/.github/workflows/qa-suite.yml b/.github/workflows/qa-suite.yml index 0e31712bb..657815720 100644 --- a/.github/workflows/qa-suite.yml +++ b/.github/workflows/qa-suite.yml @@ -28,7 +28,7 @@ jobs: - name: Set Up Python uses: actions/setup-python@v5 with: - python-version: '3.12.4' + python-version: '3.12.5' cache: 'pip' - name: Install Swirl run: ./install.sh diff --git a/.github/workflows/spell-checker.yml b/.github/workflows/spell-checker.yml index 59aca2409..3c61204c9 100644 --- a/.github/workflows/spell-checker.yml +++ b/.github/workflows/spell-checker.yml @@ -8,9 +8,26 @@ on: paths: - "docs/**" + pull_request: + # Run for all PRs to develop and main - means PR cannot merge until the spell check passes + branches: + - develop + - main + # Skip non-code changes + paths-ignore: + - '.github/**' + - 'integrations/**' + - 'swirl-infra/**' + - 'db.sqlite3.dist' # Allows manual run of this workflow from the Actions tab (on any branch) workflow_dispatch: +permissions: + contents: read + actions: read + checks: write + pull-requests: write + jobs: build: runs-on: ubuntu-latest diff --git a/.github/workflows/test-build-pipeline.yml b/.github/workflows/test-build-pipeline.yml index 059f86ee2..a655acda7 100644 --- a/.github/workflows/test-build-pipeline.yml +++ b/.github/workflows/test-build-pipeline.yml @@ -20,7 +20,7 @@ jobs: - name: Set Up Python uses: actions/setup-python@v5 with: - python-version: '3.12.4' + python-version: '3.12.5' cache: 'pip' - name: Install Swirl run: ./install.sh @@ -53,7 +53,7 @@ jobs: - name: Set Up Python uses: actions/setup-python@v5 with: - python-version: '3.12.4' + python-version: '3.12.5' cache: 'pip' - name: Install Swirl run: ./install.sh diff --git 
a/.github/workflows/testing-wip.yml b/.github/workflows/testing-wip.yml index e6fb64915..7ea047f2e 100644 --- a/.github/workflows/testing-wip.yml +++ b/.github/workflows/testing-wip.yml @@ -8,10 +8,14 @@ name: Testing WIP on: workflow_dispatch: inputs: + qa_image: # Input the QA Repo image to use + description: 'QA Repo image to use' + required: true + default: 'automated-tests-master' # Default image if none specified behave_tags: # Input the Behave tag(s) to run description: 'Behave tag(s) to run' required: true - default: 'estest' # Default tag if none specified + default: 'qa_suite,community' # Default tag if none specified jobs: wip-tests: @@ -37,7 +41,7 @@ jobs: - name: Set Up Python uses: actions/setup-python@v5 with: - python-version: '3.12.4' + python-version: '3.12.5' cache: 'pip' - name: Install Swirl run: ./install.sh @@ -65,7 +69,7 @@ jobs: echo "========" cat .env.qa echo "========" - docker run --net=host --env-file .env.qa -t swirlai/swirl-search-qa:automated-tests-master sh -c "behave --tags=${{ github.event.inputs.behave_tags }}" + docker run --net=host --env-file .env.qa -t swirlai/swirl-search-qa:${{ github.event.inputs.qa_image }} sh -c "behave --tags=${{ github.event.inputs.behave_tags }}" - name: Upload Log Files if: always() uses: actions/upload-artifact@v4 diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 95276bcf4..5dd320254 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -1,12 +1,28 @@ name: Unit Tests on: - # Manual trigger only + pull_request: + # Run for all PRs to develop - means PR cannot merge until unit tests pass + branches: + - develop + # Skip non-code changes + paths-ignore: + - '.github/**' + - 'integrations/**' + - 'swirl-infra/**' + - 'docs/**' + - 'README.md' + - 'db.sqlite3.dist' workflow_dispatch: -jobs: +permissions: + contents: read + actions: read + checks: write + pull-requests: write - build: +jobs: + unit-tests: runs-on: ubuntu-latest 
steps: @@ -15,14 +31,15 @@ jobs: - name: Set Up Python uses: actions/setup-python@v5 with: - python-version: '3.12.4' + python-version: '3.12.5' cache: 'pip' - name: Install Swirl run: ./install.sh - name: Install the Unit Tests run: ./install-test.sh - name: Run the Unit Tests - run: pytest + # generate a JUnit XML report for the test results + run: pytest --junitxml=reports/junit.xml --tb=short --ignore=integrations - name: Upload Log Files if: always() uses: actions/upload-artifact@v4 @@ -31,3 +48,9 @@ jobs: path: | logs/ /var/log/syslog* + + - name: Publish Unit Test Results + if: always() + uses: EnricoMi/publish-unit-test-result-action@v2 + with: + files: reports/junit.xml \ No newline at end of file diff --git a/DevUtils/docker/sw-start-sa.sh b/DevUtils/docker/sw-start-sa.sh deleted file mode 100644 index 9df1558dd..000000000 --- a/DevUtils/docker/sw-start-sa.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -# Start Redis in the background -redis-server & - -service nginx start - -echo "$PROG Copying: .env.dist -> .env" -cp .env.dist .env - -echo "$PROG Copying: db.sqlite3.dist -> db.sqlite3" -cp db.sqlite3.dist db.sqlite3 - -# Your original command to setup and start the application -rm -fr ./.swirl && python swirl.py setup && mkdir -p static/api/config && -/usr/bin/jq ".default" ./config-swirl-demo.db.json | sed -e "s//$MSAL_APP_ID/" \ - -e "s//$MSAL_TENANT_ID/" \ - -e "s/http:\/\//https:\/\//" \ - -e "s//$MSAL_CB_PORT/" \ - -e "s//$MSAL_HOST/" \ - -e "s/ws:/wss:/" > static/api/config/default && -python swirl.py start celery-worker celery-beats && -daphne -b 0.0.0.0 -p 8000 swirl_server.asgi:application - -# Keep the container running (if needed) -wait diff --git a/Dockerfile b/Dockerfile index f9550e4d9..621ffe1ad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,59 +1,37 @@ -# Use an official Python runtime as a parent image -FROM python:3.12.4-slim-bookworm - -# Upgrade pip to the specified version or higher -RUN pip install --no-cache-dir --upgrade 
'pip>=24.0' - -# try to upgrade to a more recent version of openssl -RUN apt-get update -RUN apt-get -y upgrade openssl - -# install jq -RUN apt-get -y install jq - -# RUN sudo echo 'nameserver 8.8.8.8'>/etc/resolv.conf -RUN apt-get update -y -RUN apt-get install apt-file -y -RUN apt-file update -RUN apt-get install -y python3-dev build-essential -RUN apt-get install -y procps -RUN apt-get install -y libpq-dev - -RUN pip install --no-cache-dir --upgrade pip -RUN pip install --no-cache-dir --upgrade grpcio - -ADD requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -# install redis -RUN apt-get install -y redis-server - -# install requirements -RUN python -m spacy download en_core_web_lg -RUN python -m nltk.downloader stopwords -RUN python -m nltk.downloader punkt - -# Copy Swirl App to container -RUN mkdir /app +FROM python:3.12.5-slim-bookworm + +# Update, upgrade and install packages in a single RUN to reduce layers +RUN apt-get update && apt-get install -y \ + apt-file \ + build-essential \ + jq \ + libpq-dev \ + procps \ + python3-dev \ + redis-server \ +&& apt-file update \ +&& apt-get clean \ +&& rm -rf /var/lib/apt/lists/* + +# Copy application files (see .dockerignore for list of exclusions) +COPY . /app COPY ./db.sqlite3.dist /app/db.sqlite3 COPY ./.env.docker /app/.env -COPY ./install-ui.sh /app/install-ui.sh -ADD ./swirl /app/swirl +COPY ./download-nltk-resources.sh /app/ -# Install Galaxy UI -RUN mkdir -p /app/swirl/static/galaxy -COPY --from=swirlai/spyglass:latest /usr/src/spyglass/ui/dist/spyglass/browser/. 
/app/swirl/static/galaxy -COPY --from=swirlai/spyglass:latest /usr/src/spyglass/ui/config-swirl-demo.db.json /app/ +WORKDIR /app -ADD ./swirl_server /app/swirl_server -ADD ./SearchProviders /app/SearchProviders -ADD ./DevUtils /app/DevUtils -ADD ./Data /app/Data -ADD ./uploads /app/uploads -ADD ./swirl.py /app/swirl.py -ADD ./swirl_load.py /app/swirl_load.py -ADD ./manage.py /app/manage.py +# Optimize pip and Python installations +RUN pip install --no-cache-dir --upgrade pip \ + && pip install --no-cache-dir -r requirements.txt \ + && pip install --no-cache-dir --upgrade grpcio -WORKDIR /app +# Swirl install requirements +RUN python -m spacy download en_core_web_lg && \ + ./download-nltk-resources.sh + +# Install the Galaxy UI +COPY --from=swirlai/spyglass:preview /usr/src/spyglass/ui/dist/spyglass/browser/. /app/swirl/static/galaxy +COPY --from=swirlai/spyglass:preview /usr/src/spyglass/ui/config-swirl-demo.db.json /app/ EXPOSE 8000 diff --git a/db.sqlite3.dist b/db.sqlite3.dist index 422523e2e..73efcb3c7 100644 Binary files a/db.sqlite3.dist and b/db.sqlite3.dist differ diff --git a/docker-compose.yaml b/docker-compose.yaml index 0fe8dce8c..caf114ca6 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,10 +1,20 @@ services: redis: - image: redis + image: redis:latest + ports: + - "6379:6379" + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 5s + retries: 5 app: image: swirlai/swirl-search:latest ports: - "8000:8000" + depends_on: + redis: + condition: service_healthy command: > sh -c 'rm -fr ./.swirl && python swirl.py setup && mkdir -p static/api/config && /usr/bin/jq ".default" ./config-swirl-demo.db.json | sed -e "s//$MSAL_APP_ID/" \ diff --git a/download-nltk-resources.sh b/download-nltk-resources.sh new file mode 100755 index 000000000..709fdb8dc --- /dev/null +++ b/download-nltk-resources.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +python -c "import nltk; nltk.download('stopwords')" +python -c "import nltk; 
nltk.download('punkt_tab')" diff --git a/install.sh b/install.sh index ef4b92e95..8c3aed96c 100755 --- a/install.sh +++ b/install.sh @@ -99,7 +99,6 @@ else fi echo "$PROG Downloading NLTK modules..." -python -m nltk.downloader stopwords -python -m nltk.downloader punkt +./download-nltk-resources.sh echo "$PROG If no errors occurred, run python swirl.py setup" diff --git a/requirements.txt b/requirements.txt index 3adf36242..e89f5c4a7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,22 @@ +## The following requirements were added by pip freeze: amqp==5.2.0 annotated-types==0.7.0 -anyio==4.4.0 +anyio==4.5.0 asgiref==3.8.1 asn1crypto==1.5.1 -attrs==23.2.0 -autobahn==23.6.2 -Automat==22.10.0 +attrs==24.2.0 +autobahn==24.4.2 +Automat==24.8.1 +azure-core==1.31.0 beautifulsoup4==4.12.3 billiard==4.2.0 blis==0.7.11 bs4==0.0.2 -cachetools==5.3.3 +cachetools==5.5.0 catalogue==2.0.10 -celery==5.4.0 -certifi==2024.7.4 -cffi==1.16.0 +celery==5.5.0b3 +certifi==2024.8.30 +cffi==1.17.1 channels==4.1.0 channels-redis==4.2.0 chardet==5.2.0 @@ -23,17 +25,17 @@ click==8.1.7 click-didyoumean==0.3.1 click-plugins==1.1.1 click-repl==0.3.0 -cloudpathlib==0.18.1 +cloudpathlib==0.19.0 confection==0.1.5 constantly==23.10.4 -cron-descriptor==1.4.3 -cryptography==42.0.8 +cron-descriptor==1.4.5 +cryptography==43.0.1 cssselect==1.2.0 cymem==2.0.8 daphne==4.1.2 distro==1.9.0 -Django==5.0.7 -django-celery-beat==2.6.0 +Django==5.1.1 +django-celery-beat==2.7.0 django-environ==0.11.2 django-restframework==0.0.1 django-timezone-field==7.0 @@ -41,127 +43,135 @@ djangorestframework==3.15.2 dnspython==2.6.1 docutils==0.21.2 drf-spectacular==0.27.2 -elastic-transport==8.13.1 -elasticsearch==8.14.0 +elastic-transport==8.15.0 +elasticsearch==8.15.1 en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl#sha256=ab70aeb6172cde82508f7739f35ebc9918a3d07debeed637403c8f794ba3d3dc Events==0.5 -filelock==3.15.4 
-fsspec==2024.6.1 -google-api-core==2.19.1 -google-auth==2.32.0 +filelock==3.16.1 +fsspec==2024.9.0 +google-api-core==2.20.0 +google-auth==2.35.0 google-cloud-bigquery==3.25.0 google-cloud-core==2.4.1 -google-crc32c==1.5.0 -google-resumable-media==2.7.1 -googleapis-common-protos==1.63.2 -grpcio==1.64.1 -grpcio-status==1.64.1 -grpcio-tools==1.64.1 +google-crc32c==1.6.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.65.0 +grpcio==1.66.1 +grpcio-status==1.66.1 +grpcio-tools==1.66.1 h11==0.14.0 h2==4.1.0 hpack==4.0.0 httpcore==1.0.5 -httpx==0.27.0 -huggingface-hub==0.23.4 +httpx==0.27.2 +huggingface-hub==0.25.0 hyperframe==6.0.1 hyperlink==21.0.0 -idna==3.7 -incremental==22.10.0 +idna==3.10 +incremental==24.7.2 inflection==0.5.1 Jinja2==3.1.4 +jiter==0.5.0 joblib==1.4.2 jsonpath-ng==1.6.1 jsonschema==4.23.0 jsonschema-specifications==2023.12.1 -kombu==5.3.7 +kombu==5.4.2 langcodes==3.4.0 language_data==1.2.0 -lxml==5.2.2 -lxml_html_clean==0.1.1 +lxml==5.3.0 +lxml_html_clean==0.2.2 marisa-trie==1.2.0 markdown-it-py==3.0.0 MarkupSafe==2.1.5 mdurl==0.1.2 mpmath==1.3.0 -msal==1.29.0 -msgpack==1.0.8 +msal==1.31.0 +msgpack==1.1.0 murmurhash==1.0.10 natsort==8.4.0 networkx==3.3 -nltk==3.8.1 +nltk==3.9.1 numpy==1.26.4 -openai==1.35.13 -opensearch-py==2.6.0 -oracledb==2.2.1 +openai==1.46.1 +opensearch-py==2.7.1 +oracledb==2.4.1 packaging==24.1 -pandas==2.2.2 +pandas==2.2.3 +phonenumbers==8.13.45 pika==1.3.2 -pinecone-client==4.1.2 +pinecone-client==5.0.1 +pinecone-plugin-inference==1.1.0 pinecone-plugin-interface==0.0.7 -platformdirs==4.2.2 +platformdirs==4.3.6 ply==3.11 -portalocker==2.10.0 +portalocker==2.10.1 preshed==3.0.9 +presidio_analyzer==2.2.355 +presidio_anonymizer==2.2.355 prompt_toolkit==3.0.47 proto-plus==1.24.0 -protobuf==5.27.2 +protobuf==5.28.2 psycopg2-binary==2.9.9 pyahocorasick==2.1.0 -pyasn1==0.6.0 -pyasn1_modules==0.4.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.1 pycparser==2.22 -pydantic==2.8.2 -pydantic_core==2.20.1 +pycryptodome==3.20.0 
+pydantic==2.9.2 +pydantic_core==2.23.4 Pygments==2.18.0 -PyJWT==2.8.0 -pymongo==4.8.0 -pyOpenSSL==24.1.0 +PyJWT==2.9.0 +pymongo==4.9.1 +pyOpenSSL==24.2.1 python-crontab==3.2.0 python-dateutil==2.9.0.post0 -pytz==2024.1 -PyYAML==6.0.1 +pytz==2024.2 +PyYAML==6.0.2 qdrant-client==1.10.0 readability-lxml==0.8.1 -redis==5.0.7 +redis==5.0.8 referencing==0.35.1 -regex==2024.5.15 +regex==2024.9.11 requests==2.32.3 -rich==13.7.1 -rpds-py==0.19.0 +requests-file==2.1.0 +rich==13.8.1 +rpds-py==0.20.0 rsa==4.9 -safetensors==0.4.3 +safetensors==0.4.5 service-identity==24.1.0 -setuptools==70.3.0 +setuptools==75.1.0 shellingham==1.5.4 six==1.16.0 smart-open==7.0.4 sniffio==1.3.1 -snowflake-connector-python==3.11.0 +snowflake-connector-python==3.12.2 sortedcontainers==2.4.0 -soupsieve==2.5 +soupsieve==2.6 spacy==3.7.5 spacy-legacy==3.0.12 spacy-loggers==1.0.5 -sqlparse==0.5.0 +sqlparse==0.5.1 srsly==2.4.8 statistics==1.0.3.5 -sympy==1.13.0 +sympy==1.13.3 textblob==0.18.0.post0 thinc==8.2.5 tika==2.6.0 tiktoken==0.7.0 +tldextract==5.1.2 tokenizers==0.19.1 -tomlkit==0.13.0 -torch==2.3.1 -tqdm==4.66.4 -transformers==4.42.3 -Twisted==24.3.0 +tomlkit==0.13.2 +torch==2.4.1 +tqdm==4.66.5 +transformers==4.44.2 +Twisted==24.7.0 txaio==23.1.1 -typer==0.12.3 +typer==0.12.5 typing_extensions==4.12.2 tzdata==2024.1 uritemplate==4.1.1 -urllib3==2.2.2 +urllib3==2.2.3 vine==5.1.0 wasabi==1.1.3 wcwidth==0.2.13 @@ -169,4 +179,4 @@ weasel==0.4.1 whitenoise==6.7.0 wrapt==1.16.0 xmltodict==0.13.0 -zope.interface==6.4.post2 +zope.interface==7.0.3 diff --git a/swirl/banner.py b/swirl/banner.py index 0894695c5..e75103274 100644 --- a/swirl/banner.py +++ b/swirl/banner.py @@ -10,9 +10,9 @@ class bcolors: ENDC = '\033[0m' BOLD = '\033[1m' -SWIRL_VERSION = '3.6.0.1' +SWIRL_VERSION = '3.8.0.0' -SWIRL_BANNER_TEXT = "__S_W_I_R_L__3_._6_._0_._1_____________________________________________________" +SWIRL_BANNER_TEXT = "__S_W_I_R_L__3_._8_._0_._0__________________________________________________________" 
SWIRL_BANNER = f'{bcolors.BOLD}{SWIRL_BANNER_TEXT}{bcolors.ENDC}' ############################################# diff --git a/swirl/connectors/verify_ssl_common.py b/swirl/connectors/verify_ssl_common.py index 23942457a..5e6c2c548 100644 --- a/swirl/connectors/verify_ssl_common.py +++ b/swirl/connectors/verify_ssl_common.py @@ -34,7 +34,7 @@ def get_creds(self, def_verify_certs=False): for cre in cred_list: if cre.startswith('bearer='): - # handle this speacial becauase tokens have '=' sign in them + # handle this special because tokens have '=' sign in them bearer = cre[len('bearer='):] if not bearer: self.log_invalid_credentials() diff --git a/swirl/models.py b/swirl/models.py index 03e940ac4..5cf16137f 100644 --- a/swirl/models.py +++ b/swirl/models.py @@ -99,7 +99,8 @@ class SearchProvider(models.Model): ('GenAIQueryProcessor', 'GenAIQueryProcessor'), ('AdaptiveQueryProcessor', 'AdaptiveQueryProcessor'), ('NoModQueryProcessor', 'NoModQueryProcessor'), - ('SpellcheckQueryProcessor', 'SpellcheckQueryProcessor') + ('SpellcheckQueryProcessor', 'SpellcheckQueryProcessor'), + ('RemovePIIQueryProcessor', 'RemovePIIQueryProcessor'), ] query_processors = models.JSONField(default=getSearchProviderQueryProcessorsDefault, blank=True) query_mappings = models.CharField(max_length=2048, default=str, blank=True) @@ -114,7 +115,8 @@ class SearchProvider(models.Model): ('CleanTextResultProcessor','CleanTextResultProcessor'), ('RequireQueryStringInTitleResultProcessor','RequireQueryStringInTitleResultProcessor'), ('AutomaticPayloadMapperResultProcessor', 'AutomaticPayloadMapperResultProcessor'), - ('CosineRelevancyResultProcessor','CosineRelevancyResultProcessor') + ('CosineRelevancyResultProcessor','CosineRelevancyResultProcessor'), + ('RedactPIIResultProcessor', 'RedactPIIResultProcessor'), ] response_mappings = models.CharField(max_length=2048, default=str, blank=True) @@ -177,6 +179,7 @@ class Search(models.Model): 
('DropIrrelevantPostResultProcessor','DropIrrelevantPostResultProcessor'), ('DedupeByFieldPostResultProcessor', 'DedupeByFieldPostResultProcessor'), ('DedupeBySimilarityPostResultProcessor', 'DedupeBySimilarityPostResultProcessor'), + ('RedactPIIPostResultProcessor', 'RedactPIIPostResultProcessor'), ] post_result_processors = models.JSONField(default=getSearchPostResultProcessorsDefault, blank=True) result_url = models.CharField(max_length=2048, default='/swirl/results?search_id=%d&result_mixer=%s', blank=True) diff --git a/swirl/processors/__init__.py b/swirl/processors/__init__.py index d4282054d..202db90c5 100644 --- a/swirl/processors/__init__.py +++ b/swirl/processors/__init__.py @@ -13,9 +13,9 @@ from swirl.processors.gen_ai_query import * from swirl.processors.transform_query_processor import * from swirl.processors.date_finder import * +from swirl.processors.remove_pii import * from swirl.models import Search, SearchProvider - def alloc_processor(processor): if not processor: logger.error("blank processor") diff --git a/swirl/processors/rag.py b/swirl/processors/rag.py index 28aabea5b..ac5261def 100644 --- a/swirl/processors/rag.py +++ b/swirl/processors/rag.py @@ -113,7 +113,7 @@ def background_process(self): if result.json_results: for item in result.json_results: if rag_query_items: - if str(item['swirl_id']) in rag_query_items: + if 'swirl_id' in item and str(item['swirl_id']) in rag_query_items: rag_item_list.append(item) item['provider_id'] = result.provider_id elif 'swirl_score' in item: diff --git a/swirl/processors/remove_pii.py b/swirl/processors/remove_pii.py new file mode 100644 index 000000000..6f7c9b65c --- /dev/null +++ b/swirl/processors/remove_pii.py @@ -0,0 +1,186 @@ +''' +@author: Sid Probstein +@contact: sid@swirl.today +''' + +from django.conf import settings + +from celery.utils.log import get_task_logger +logger = get_task_logger(__name__) + +from swirl.processors.generic import QueryProcessor, ResultProcessor, PostResultProcessor + 
+from presidio_analyzer import AnalyzerEngine +from presidio_anonymizer import AnonymizerEngine, OperatorConfig + +# Instantiate Presidio Analyzer and Anonymizer +analyzer = AnalyzerEngine() +anonymizer = AnonymizerEngine() + +############################################# + +def redact_pii(text: str, query_string=None) -> str: + """ + Redacts PII from the given text string using Presidio. + + :param text: The input string (either query or result) to clean. + :return: The text with PII redacted. + """ + + return remove_pii(text, query_string, redact=True) + +from swirl.processors.utils import remove_tags, highlight_list + +def remove_pii(text: str, query_string=None, redact=False) -> str: + """ + Removes PII from the given text string using Presidio. + + By default, Presidio redacts entities, replacing each with its entity type in angle brackets. + The Presidio "redact" option removes the PII entirely. + In SWIRL, remove means "remove the PII" and "redact" means "replace it with its entity type in square brackets". + + :param text: The input string (either query or result) to clean. + :return: The text with PII removed. + """ + + untagged_text = remove_tags(text) + pii_entities = analyzer.analyze(text=untagged_text, language='en') + + if not pii_entities: + return text + + operators = {"DEFAULT": OperatorConfig("redact")} + if redact: + # if specified + operators = {"DEFAULT": OperatorConfig("replace")} + + anonymized_result = anonymizer.anonymize( + text=untagged_text, + analyzer_results=pii_entities, + operators=operators + ) + + anonymized_text = anonymized_result.text + + if redact: + anonymized_text = anonymized_text.replace('<', '[').replace('>', ']') + + if query_string: + highlighted_anonymized_text = highlight_list(anonymized_text, query_string.split()) + return highlighted_anonymized_text + + return anonymized_text + +############################################# + +class RemovePIIQueryProcessor(QueryProcessor): + """ + A SWIRL metasearch query processor that removes PII from search queries. 
+ """ + + type = 'RemovePIIQueryProcessor' + + def process(self) -> str: + """ + :return: The processed query with PII removed. + """ + + # Remove PII from the query + cleaned_query = remove_pii(self.query_string) + + return cleaned_query + +############################################# + +class RedactPIIResultProcessor(ResultProcessor): + """ + A SWIRL result processor that redacts PII from the search results. + Meant to be run after CosineRelevancyResultProcessor. + """ + + type = "RemovePIIResultProcessor" + + def process(self) -> int: + """ + :return: The number of modified results. + """ + logger.debug(f"Processing {len(self.results)} results for PII removal.") + + modified = 0 + for item in self.results: + pii_modified = False + + # Redact PII from the 'title', 'body' and string 'payload' fields of each result + if 'title' in item: + cleaned_title = redact_pii(item['title'], self.query_string) + if cleaned_title != item['title']: + item['title'] = cleaned_title + pii_modified = True + + if 'body' in item: + cleaned_body = redact_pii(item['body'], self.query_string) + if cleaned_body != item['body']: + item['body'] = cleaned_body + pii_modified = True + + if 'payload' in item: + for key in item['payload']: + if type(item['payload'][key]) is not str: + continue + cleaned_payload = redact_pii(item['payload'][key], self.query_string) + if cleaned_payload != item['payload'][key]: + item['payload'][key] = cleaned_payload + pii_modified = True + + if pii_modified: + modified += 1 + + self.processed_results = self.results + self.modified = modified + logger.debug(f"PII removal complete. {self.modified} results modified.") + + return self.modified + +############################################# + +class RedactPIIPostResultProcessor(PostResultProcessor): + """ + A SWIRL post-result processor that redacts PII from all results. + """ + + type = "RemovePIIPostResultProcessor" + + def process(self) -> int: + """ + :return: The number of modified results. 
+ """ + + modified = 0 + + for result in self.results: + for item in result.json_results: + pii_modified = False + if 'title' in item: + cleaned_title = redact_pii(item['title'], self.search.query_string_processed) + if cleaned_title != item['title']: + item['title'] = cleaned_title + pii_modified = True + if 'body' in item: + cleaned_body = redact_pii(item['body'], self.search.query_string_processed) + if cleaned_body != item['body']: + item['body'] = cleaned_body + pii_modified = True + if 'payload' in item: + for key in item['payload']: + if type(item['payload'][key]) is not str: + continue + cleaned_payload = redact_pii(item['payload'][key], self.search.query_string_processed) + if cleaned_payload != item['payload'][key]: + item['payload'][key] = cleaned_payload + pii_modified = True + if pii_modified: + modified += 1 + result.save() + + self.results_updated = modified + return self.results_updated diff --git a/swirl/rag_prompt.py b/swirl/rag_prompt.py index e2c7380f3..eb4ac4ed4 100644 --- a/swirl/rag_prompt.py +++ b/swirl/rag_prompt.py @@ -29,6 +29,18 @@ def __init__(self, query, max_tokens, model): self._last_chunk_status = RAG_PROMPT_CHUNK_OK self._model_encoding = tiktoken.encoding_for_model(model) + self._prompt_footer = ( + f"\n\n\n\n--- Final Instructions ---\nIn your response do not assume people with vastly different work histories are the same person. " + f"If the query appears to be a proper name, focus on answering the question, 'Who is?' or 'What is?', as appropriate. " + f"If the query appears to be a question, then try to answer it. " + f"For the list of sources, use the HTML tags and format in the example below, do not generate duplicate entries, one entry per source.:\n" + f"\n

" + f"\n
Sources:" + f"\n
example description 1     example URL or source name 1" + f"\n
example description 2     example URL or source name 2" + f"\n

" + f"\n\nEnclose your response in HTML tags

and insert a
HTML tag every two sentences." + ) def get_num_tokens(self): return self._num_tokens @@ -121,7 +133,7 @@ def put_chunk(self, chunk, url, type, filter_file_type=True): def get_promp_text(self): logger.info(f'{self} : max_tokens:{self._max_tokens} num_tokens {self.get_num_tokens()} is_full:{self.is_full()}') - return self._prompt_text + return self._prompt_text + self._prompt_footer def get_role_system_guide_text(self): return MODEL_DEFAULT_SYSTEM_GUIDE diff --git a/swirl/tests/tests.py b/swirl/tests/tests.py index 730e26807..351970ccb 100644 --- a/swirl/tests/tests.py +++ b/swirl/tests/tests.py @@ -964,7 +964,7 @@ def test_query_transform_viewset_crud(api_client, test_suser, test_suser_pw, qrx qrx_record_1['config_content'] = "# This is an update\n# column1, colum2\nmobiles; ombile; mo bile, mobile\ncheapest smartphones, cheap smartphone" purl = reverse('update', kwargs={'pk': 1}) response = api_client.put(purl, data=qrx_record_1, format='json') - assert response.status_code == 201, 'Expected HTTP status code 201' + assert response.status_code == 200, 'Expected HTTP status code 200' response = api_client.get(reverse('querytransforms/list')) assert response.status_code == 200, 'Expected HTTP status code 200' assert len(response.json()) == 1, 'Expected 1 transform' diff --git a/swirl/views.py b/swirl/views.py index 9dfeb58c7..c2d336084 100644 --- a/swirl/views.py +++ b/swirl/views.py @@ -313,7 +313,7 @@ def update(self, request, pk=None): serializer.is_valid(raise_exception=True) # security review for 1.7 - OK, saved with owner serializer.save(owner=self.request.user) - return Response(serializer.data, status=status.HTTP_201_CREATED) + return Response(serializer.data, status=status.HTTP_200_OK) ######################################## @@ -332,6 +332,8 @@ def destroy(self, request, pk=None): searchprovider.delete() return Response('SearchProvider Object Deleted', status=status.HTTP_410_GONE) + def partial_update(self, request, pk=None): + return self.update(request, 
pk) ######################################## ######################################## @@ -612,7 +614,7 @@ def update(self, request, pk=None): # search_task.delay(search.id, Authenticator().get_session_data(request)) logger.info(f"{request.user} search_put {search.id}") run_search(search.id, Authenticator().get_session_data(request), request=request) - return Response(serializer.data, status=status.HTTP_201_CREATED) + return Response(serializer.data, status=status.HTTP_200_OK) ######################################## @@ -632,6 +634,9 @@ def destroy(self, request, pk=None): search.delete() return Response('Search Object Deleted', status=status.HTTP_410_GONE) + def partial_update(self, request, pk=None): + return self.update(request, pk) + ######################################## ######################################## @@ -765,7 +770,7 @@ def update(self, request, pk=None): serializer.is_valid(raise_exception=True) # security review for 1.7 - OK, saved with owner serializer.save(owner=self.request.user) - return Response(serializer.data, status=status.HTTP_201_CREATED) + return Response(serializer.data, status=status.HTTP_200_OK) ######################################## @@ -785,6 +790,9 @@ def destroy(self, request, pk=None): result.delete() return Response('Result Object Deleted!', status=status.HTTP_410_GONE) + def partial_update(self, request, pk=None): + return self.update(request, pk) + ######################################## ######################################## @@ -894,7 +902,7 @@ def update(self, request, pk=None): serializer.is_valid(raise_exception=True) # security review for 1.7 - OK, saved with owner serializer.save(owner=self.request.user) - return Response(serializer.data, status=status.HTTP_201_CREATED) + return Response(serializer.data, status=status.HTTP_200_OK) ######################################## @@ -910,7 +918,10 @@ def destroy(self, request, pk=None): searchprovider = QueryTransform.objects.get(pk=pk) searchprovider.delete() - return 
Response('QueryTranformation Object Deleted', status=status.HTTP_410_GONE) + return Response('QueryTransformation Object Deleted', status=status.HTTP_410_GONE) + + def partial_update(self, request, pk=None): + return self.update(request, pk) def query_transform_form(request): if request.method == 'POST':