From e0b51ac6380ff447474eb24b75ef49b93ee88636 Mon Sep 17 00:00:00 2001 From: Colton Loftus <70598503+C-Loftus@users.noreply.github.com> Date: Thu, 28 Nov 2024 17:32:58 -0500 Subject: [PATCH] add pyright and e2e tests --- .github/workflows/e2e_test.yml | 22 +++++++++++++ .github/workflows/pyright.yml | 26 +++++++++++++++ .vscode/launch.json | 4 +-- Docker/Dockerfile_user_code | 2 +- Docker/docker-compose-user-code.yaml | 8 ++++- Docker/entrypoint.sh | 4 +-- Docker/user_code_requirements.txt | 1 + docs/docs.md | 2 +- main.py | 33 +++++++++++++++++-- {code/lib => userCode}/__init__.py | 0 userCode/lib/__init__.py | 0 {code => userCode}/lib/classes.py | 6 ++-- {code => userCode}/lib/env.py | 0 {code => userCode}/lib/types.py | 0 {code => userCode}/lib/utils.py | 0 {code => userCode}/main.py | 15 +++++---- {code => userCode}/readme.md | 0 userCode/test/__init__.py | 0 .../test}/data/nquads_2024_09_23_20_05_20.nq | 0 userCode/test/test_e2e.py | 11 +++++++ 20 files changed, 114 insertions(+), 20 deletions(-) create mode 100644 .github/workflows/e2e_test.yml create mode 100644 .github/workflows/pyright.yml rename {code/lib => userCode}/__init__.py (100%) create mode 100644 userCode/lib/__init__.py rename {code => userCode}/lib/classes.py (90%) rename {code => userCode}/lib/env.py (100%) rename {code => userCode}/lib/types.py (100%) rename {code => userCode}/lib/utils.py (100%) rename {code => userCode}/main.py (97%) rename {code => userCode}/readme.md (100%) create mode 100644 userCode/test/__init__.py rename {test => userCode/test}/data/nquads_2024_09_23_20_05_20.nq (100%) create mode 100644 userCode/test/test_e2e.py diff --git a/.github/workflows/e2e_test.yml b/.github/workflows/e2e_test.yml new file mode 100644 index 00000000..83bbcdd4 --- /dev/null +++ b/.github/workflows/e2e_test.yml @@ -0,0 +1,22 @@ +name: Run Python Script in Docker + +on: + push: + workflow_dispatch: # Allows manual triggering of the workflow + +jobs: + setup-docker-and-run: + runs-on: 
ubuntu-latest + + steps: + # Checkout the repository + - name: Checkout Code + uses: actions/checkout@v4 + + # Set up Docker + - name: Set up Docker + uses: docker/setup-buildx-action@v2 + + # Run the Python script that handles the build and execution + - name: Run Python Script + run: python3 main.py local && python3 main.py test diff --git a/.github/workflows/pyright.yml b/.github/workflows/pyright.yml new file mode 100644 index 00000000..b97abfe0 --- /dev/null +++ b/.github/workflows/pyright.yml @@ -0,0 +1,26 @@ +name: Pyright Type Checks + +on: + push: + workflow_dispatch: # Allows manual triggering of the workflow + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + cache: 'pip' + + - run: | + python -m venv .venv + source .venv/bin/activate + pip install -r Docker/user_code_requirements.txt + + - run: echo "$PWD/.venv/bin" >> $GITHUB_PATH + + - uses: jakebailey/pyright-action@v2 + with: + pylance-version: latest-release + diff --git a/.vscode/launch.json b/.vscode/launch.json index 8767dd45..da5c2fa2 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -11,8 +11,8 @@ }, "pathMappings": [ { - "localRoot": "${workspaceFolder}/code", - "remoteRoot": "/opt/dagster/app/code" + "localRoot": "${workspaceFolder}/userCode", + "remoteRoot": "/opt/dagster/app/userCode" } ], "justMyCode": true, diff --git a/Docker/Dockerfile_user_code b/Docker/Dockerfile_user_code index b0df49f9..eb6222c2 100644 --- a/Docker/Dockerfile_user_code +++ b/Docker/Dockerfile_user_code @@ -13,7 +13,7 @@ ENV DAGSTER_DEBUG=$DAGSTER_DEBUG # configs and runtime code WORKDIR /opt/dagster/app -COPY code /opt/dagster/app/code +COPY userCode /opt/dagster/app/userCode COPY templates /opt/dagster/app/templates # Expose the necessary ports diff --git a/Docker/docker-compose-user-code.yaml b/Docker/docker-compose-user-code.yaml index cad44417..44b075aa 100644 --- a/Docker/docker-compose-user-code.yaml +++ 
b/Docker/docker-compose-user-code.yaml @@ -11,7 +11,13 @@ services: environment: DAGSTER_CURRENT_IMAGE: "dagster_user_code_image" volumes: - - ../code:/opt/dagster/app/code + - ../userCode:/opt/dagster/app/userCode + # When materialized via the UI, dagster runs the + # user code inside the webserver container + # However, if we are just running pytest we need to have direct + # access to the docker sock inside the user code container + - /var/run/docker.sock:/var/run/docker.sock + networks: - dagster_network env_file: "../.env" diff --git a/Docker/entrypoint.sh b/Docker/entrypoint.sh index c9adc9bd..7e03793b 100755 --- a/Docker/entrypoint.sh +++ b/Docker/entrypoint.sh @@ -11,8 +11,8 @@ fi if [ "$DAGSTER_DEBUG" = "true" ]; then echo "Starting dagster in debug mode and waiting for connection to debugpy" - exec python -m debugpy --configure-subProcess true --listen 0.0.0.0:5678 -m dagster dev -h 0.0.0.0 -p 3000 --python-file /opt/dagster/app/code/main.py -d /opt/dagster/app/code + exec python -m debugpy --configure-subProcess true --listen 0.0.0.0:5678 -m dagster dev -h 0.0.0.0 -p 3000 -m userCode.main else echo "Starting dagster code server" - exec dagster code-server start -h 0.0.0.0 -p 4000 --python-file /opt/dagster/app/code/main.py -d /opt/dagster/app/code + exec dagster code-server start -h 0.0.0.0 -p 4000 -m userCode.main fi diff --git a/Docker/user_code_requirements.txt b/Docker/user_code_requirements.txt index 56fa569f..c164801b 100644 --- a/Docker/user_code_requirements.txt +++ b/Docker/user_code_requirements.txt @@ -10,3 +10,4 @@ lxml # used for parsing sitemaps pyyaml # used for processing gleaner/nabu configs beautifulsoup4 # used for parsing sitemaps aiohttp +pytest diff --git a/docs/docs.md b/docs/docs.md index d7f8752a..2985fc7e 100644 --- a/docs/docs.md +++ b/docs/docs.md @@ -56,7 +56,7 @@ This repository is a refactor of the [gleanerio/scheduler](https://github.com/gl ## Gleaner and Nabu Notes -The current pipeline for gleaner/nabu operations in 
Dagster is as follows. All of the steps are inside Dagster [here](../code/main.py) with each being a separate asset. +The current pipeline for gleaner/nabu operations in Dagster is as follows. All of the steps are inside Dagster [here](../userCode/main.py) with each being a separate asset. Since Dagster uses docs as code, the best way to get the most accurate documentation is by opening up the local UI and looking at the asset description; this will source our code and the associated comments. diff --git a/main.py b/main.py index b78eef9d..ffd13ecd 100644 --- a/main.py +++ b/main.py @@ -13,15 +13,20 @@ """ -def run_subprocess(command: str): +def run_subprocess(command: str, returnStdoutInsteadOfPrint: bool = False): """Run a shell command and stream the output in realtime""" process = subprocess.Popen( - command, shell=True, stdout=sys.stdout, stderr=sys.stderr + command, + shell=True, + stdout=subprocess.PIPE if returnStdoutInsteadOfPrint else sys.stdout, + stderr=sys.stderr, ) - process.communicate() + stdout, _ = process.communicate() if process.returncode != 0: sys.exit(process.returncode) + return stdout.decode("utf-8") if returnStdoutInsteadOfPrint else None + def down(): """Stop the docker swarm stack""" @@ -90,7 +95,24 @@ def refresh(): ) +def test(): + """Run pytest inside the user code container""" + + # get the name of the container + containerName = run_subprocess( + "docker ps --filter name=geoconnex_crawler_dagster_user_code --format '{{.Names}}'", + returnStdoutInsteadOfPrint=True, + ) + if not containerName: + raise RuntimeError("Could not find the user code container to run pytest") + containerName = containerName.strip() + run_subprocess(f"docker exec -it {containerName} pytest") + + def main(): + # set DOCKER_CLI_HINTS false to avoid the advertisement message after every docker cmd + os.environ["DOCKER_CLI_HINTS"] = "false" + # make sure the user is in the same directory as this file file_dir = os.path.dirname(os.path.abspath(__file__)) if 
file_dir != os.getcwd(): @@ -120,6 +142,9 @@ def main(): "prod", help="Spin up the docker swarm stack with remote s3 and graphdb", ) + + subparsers.add_parser("test", help="Run pytest inside the user code container") + args = parser.parse_args() if args.command == "down": down() @@ -129,6 +154,8 @@ def main(): up(local=False, debug=False) elif args.command == "refresh": refresh() + elif args.command == "test": + test() else: parser.print_help() diff --git a/code/lib/__init__.py b/userCode/__init__.py similarity index 100% rename from code/lib/__init__.py rename to userCode/__init__.py diff --git a/userCode/lib/__init__.py b/userCode/lib/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/code/lib/classes.py b/userCode/lib/classes.py similarity index 90% rename from code/lib/classes.py rename to userCode/lib/classes.py index c92aaa7f..4e93bb38 100644 --- a/code/lib/classes.py +++ b/userCode/lib/classes.py @@ -2,7 +2,7 @@ from typing import Any from dagster import get_dagster_logger from minio import Minio -from urllib3 import HTTPResponse +from urllib3 import BaseHTTPResponse from .env import ( GLEANER_MINIO_SECRET_KEY, GLEANER_MINIO_ACCESS_KEY, @@ -42,8 +42,8 @@ def read(self, remote_path: str): logger.info(f"S3 SERVER : {self.endpoint}") logger.info(f"S3 PORT : {GLEANER_MINIO_PORT}") logger.info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - logger.debug(f"S3 object path : {remote_path}") - response: HTTPResponse = self.client.get_object( + logger.debug(f"S3 object path : {remote_path}") + response: BaseHTTPResponse = self.client.get_object( GLEANER_MINIO_BUCKET, remote_path ) data = response.read() diff --git a/code/lib/env.py b/userCode/lib/env.py similarity index 100% rename from code/lib/env.py rename to userCode/lib/env.py diff --git a/code/lib/types.py b/userCode/lib/types.py similarity index 100% rename from code/lib/types.py rename to userCode/lib/types.py diff --git a/code/lib/utils.py b/userCode/lib/utils.py similarity index 100% rename 
from code/lib/utils.py rename to userCode/lib/utils.py diff --git a/code/main.py b/userCode/main.py similarity index 97% rename from code/main.py rename to userCode/main.py index a9c19c1e..acb1d751 100644 --- a/code/main.py +++ b/userCode/main.py @@ -1,7 +1,7 @@ import asyncio from datetime import datetime from typing import Tuple -from aiohttp import ClientSession +from aiohttp import ClientSession, ClientTimeout from bs4 import BeautifulSoup from dagster import ( AssetCheckResult, @@ -25,8 +25,8 @@ import dagster_slack import requests import yaml -from lib.classes import S3 -from lib.utils import ( +from .lib.classes import S3 +from .lib.utils import ( remove_non_alphanumeric, run_scheduler_docker_image, slack_error_fn, @@ -34,7 +34,7 @@ ) from urllib.parse import urlparse -from lib.env import ( +from .lib.env import ( GLEANER_GRAPH_URL, GLEANER_HEADLESS_ENDPOINT, REMOTE_GLEANER_SITEMAP, @@ -133,7 +133,9 @@ def gleaner_links_are_valid(): dead_links: list[dict[str, Tuple[int, str]]] = [] async def validate_url(url: str): - async with ClientSession() as session: + # Geoconnex links generally take at most 8 seconds, even for a very large sitemap + # If a request takes longer than 12 seconds, that is a good signal that something is wrong + async with ClientSession(timeout=ClientTimeout(total=12)) as session: resp = await session.get(url) if resp.status != 200: @@ -164,6 +166,7 @@ def docker_client_environment(): """Set up dagster by pulling both the gleaner and nabu images and moving the config files into docker configs""" get_dagster_logger().info("Getting docker client and pulling images: ") client = docker.DockerClient(version="1.43") + # check if the docker socket is available client.images.pull(GLEANERIO_GLEANER_IMAGE) client.images.pull(GLEANERIO_NABU_IMAGE) # we create configs as docker config objects so @@ -422,8 +425,6 @@ def crawl_entire_graph_schedule(): text_fn=slack_error_fn, default_status=DefaultSensorStatus.RUNNING, monitor_all_code_locations=True, - 
monitor_all_repositories=True, - monitored_jobs=[harvest_job], ) ], # Commented out but can uncomment if we want to send other slack msgs diff --git a/code/readme.md b/userCode/readme.md similarity index 100% rename from code/readme.md rename to userCode/readme.md diff --git a/userCode/test/__init__.py b/userCode/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/data/nquads_2024_09_23_20_05_20.nq b/userCode/test/data/nquads_2024_09_23_20_05_20.nq similarity index 100% rename from test/data/nquads_2024_09_23_20_05_20.nq rename to userCode/test/data/nquads_2024_09_23_20_05_20.nq diff --git a/userCode/test/test_e2e.py b/userCode/test/test_e2e.py new file mode 100644 index 00000000..4e4ca8ef --- /dev/null +++ b/userCode/test/test_e2e.py @@ -0,0 +1,11 @@ +from dagster import load_assets_from_modules, materialize +import userCode.main as main + + +def test_materialize_configs(): + result = materialize( + assets=load_assets_from_modules([main]), # type: ignore + selection=["nabu_config", "gleaner_config", "docker_client_environment"], + ) + assert result.success +