diff --git a/.config/coveragerc b/.config/coveragerc new file mode 100644 index 0000000..3a3a813 --- /dev/null +++ b/.config/coveragerc @@ -0,0 +1,12 @@ +[report] +omit = + */run.py + */python?.?/* + */venv/* + */site-packages/* + */tests/* + *__init__* + */_version.py + +exclude_lines = + if __name__ == '__main__': diff --git a/.config/pre-commit-config.yaml b/.config/pre-commit-config.yaml new file mode 100644 index 0000000..ab31cf6 --- /dev/null +++ b/.config/pre-commit-config.yaml @@ -0,0 +1,33 @@ +default_language_version: + python: python3.12 +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + exclude: test_scraper_.*\.json + - id: check-ast + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.1 + hooks: + # Run the linter. + - id: ruff + args: [--config, .config/ruff.toml, --fix] + # Run the formatter. + - id: ruff-format + args: [--config, .config/ruff.toml] + - repo: https://github.com/astral-sh/uv-pre-commit + rev: 0.2.37 + hooks: + # Run the pip compile + - id: pip-compile + name: pip-compile requirements.txt + files: pyproject.toml + args: [ pyproject.toml, --resolver=backtracking, --upgrade, -q, + -o, requirements.txt ] + - id: pip-compile + name: pip-compile requirements-test.txt + files: pyproject.toml + args: [ pyproject.toml, --resolver=backtracking, --upgrade, -q, + --extra, test, -c, requirements.txt, -o, requirements-test.txt ] diff --git a/.config/pytest.ini b/.config/pytest.ini new file mode 100644 index 0000000..bb28fd3 --- /dev/null +++ b/.config/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +pythonpath = ../src +addopts = "--color=yes" +log_cli = 1 diff --git a/.config/ruff.toml b/.config/ruff.toml new file mode 100644 index 0000000..ce3b3f2 --- /dev/null +++ b/.config/ruff.toml @@ -0,0 +1,14 @@ +line-length = 90 +exclude = ["_version.py"] + +[lint] +# List of rules: https://docs.astral.sh/ruff/rules/ +select = [ + "E", # pycodestyle - default + 
"F", # pyflakes - default + "I" # isort +] + +[lint.isort] +known-local-folder = ["hdx.scraper.wfp_rainfall"] +known-third-party = ["hdx.api", "hdx.location", "hdx.data", "hdx.database", "hdx.facades", "hdx.scraper", "hdx.utilities"] diff --git a/.github/workflows/run-python-script.yaml b/.github/workflows/run-python-script.yaml new file mode 100644 index 0000000..3fa6d6d --- /dev/null +++ b/.github/workflows/run-python-script.yaml @@ -0,0 +1,53 @@ +# This workflow will install Python dependencies and run the script + +name: Run script + +on: + workflow_dispatch: # add run button in github + schedule: + - cron: "32 10 * * *" + +jobs: + run: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.x + uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install . + - name: Run script + env: + HDX_SITE: ${{ vars.HDX_SITE }} + HDX_KEY: ${{ secrets.HDX_BOT_SCRAPERS_API_TOKEN }} + PREPREFIX: ${{ secrets.HDX_PIPELINE_PREPREFIX }} + USER_AGENT: ${{ vars.USER_AGENT }} + EXTRA_PARAMS: ${{ vars.EXTRA_PARAMS }} + run: | + python -m hdx.scraper.wfp_rainfall + - name: Send mail + if: failure() + uses: dawidd6/action-send-mail@v3 + with: + server_address: ${{secrets.HDX_PIPELINE_EMAIL_SERVER}} + server_port: ${{secrets.HDX_PIPELINE_EMAIL_PORT}} + username: ${{secrets.HDX_PIPELINE_EMAIL_USERNAME}} + password: ${{secrets.HDX_PIPELINE_EMAIL_PASSWORD}} + subject: "FAILED: ${{github.repository}} run job" + body: GitHub Actions run job for ${{github.repository}} failed! 
+ to: ${{vars.HDX_PIPELINE_EMAIL_LIST}} + from: ${{secrets.HDX_PIPELINE_EMAIL_FROM}} + + workflow-keepalive: + if: github.event_name == 'schedule' + runs-on: ubuntu-latest + permissions: + actions: write + steps: + - uses: liskin/gh-workflow-keepalive@v1 diff --git a/.github/workflows/run-python-tests.yaml b/.github/workflows/run-python-tests.yaml new file mode 100644 index 0000000..53f60ac --- /dev/null +++ b/.github/workflows/run-python-tests.yaml @@ -0,0 +1,49 @@ +# This workflow will install Python dependencies, lint and run tests +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Run tests + +on: + workflow_dispatch: # add run button in github + push: + branches-ignore: + - gh-pages + - 'dependabot/**' + pull_request: + branches-ignore: + - gh-pages + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + - name: Install Hatch + uses: pypa/hatch@install + - name: Test with hatch/pytest + env: + HDX_KEY_TEST: ${{ secrets.HDX_BOT_SCRAPERS_API_TOKEN }} + GSHEET_AUTH: ${{ secrets.GSHEET_AUTH }} + run: | + hatch test + - name: Check styling + if: always() + run: | + hatch fmt --check + - name: Publish Unit Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + junit_files: test-results.xml + - name: Publish in Coveralls + uses: coverallsapp/github-action@v2 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + flag-name: tests + format: lcov diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b94498d --- /dev/null +++ b/.gitignore @@ -0,0 +1,173 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ 
+var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. 
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ + +## Project + +# Directory where the scraper caches data +saved_data/ + +# Version file +**/_version.py + +# Mac files +.DS_Store diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3c8d5c6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 UN-OCHA Humanitarian Data Exchange Project + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 86879fb..bc5a85b 100644 --- a/README.md +++ b/README.md @@ -1 +1,103 @@ -# hdx-scraper-wfp-rainfall \ No newline at end of file +# Collector for WFP Rainfall Datasets +[![Build Status](https://github.com/OCHA-DAP/hdx-scraper-wfp-rainfall/actions/workflows/run-python-tests.yaml/badge.svg)](https://github.com/OCHA-DAP/hdx-scraper-wfp-rainfall/actions/workflows/run-python-tests.yaml) +[![Coverage Status](https://coveralls.io/repos/github/OCHA-DAP/hdx-scraper-wfp-rainfall/badge.svg?branch=main&ts=1)](https://coveralls.io/github/OCHA-DAP/hdx-scraper-wfp-rainfall?branch=main) +[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) + +This script ... + +## Development + +### Environment + +Development is currently done using Python 3.12. 
We recommend using a virtual +environment such as ``venv``: + + python3.12 -m venv venv + source venv/bin/activate + +In your virtual environment, please install all packages for +development by running: + + pip install -r requirements.txt + +### Installing and running + + +For the script to run, you will need to have a file called +.hdx_configuration.yaml in your home directory containing your HDX key, e.g.: + + hdx_key: "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX" + hdx_read_only: false + hdx_site: prod + + You will also need to supply the universal .useragents.yaml file in your home + directory as specified in the parameter *user_agent_config_yaml* passed to + facade in `__main__.py`. The collector reads the key **hdx-scraper-wfp-rainfall** as specified + in the parameter *user_agent_lookup*. + + Alternatively, you can set up environment variables: `USER_AGENT`, `HDX_KEY`, +`HDX_SITE`, `EXTRA_PARAMS`, `TEMP_DIR`, and `LOG_FILE_ONLY`. + +To install and run, execute: + + pip install . + python -m hdx.scraper.wfp_rainfall + +## Environment + +Development is currently done using Python 3.12. We recommend using a virtual +environment such as ``venv``: + + python3.12 -m venv venv + source venv/bin/activate + +### Pre-commit + +Be sure to install `pre-commit`, which is run every time +you make a git commit: + +```shell +pip install pre-commit +pre-commit install +``` + +The configuration file for this project is in a +non-standard location. Thus, you will need to edit your +`.git/hooks/pre-commit` file to reflect this. Change +the first line that begins with `ARGS` to: + + ARGS=(hook-impl --config=.config/pre-commit-config.yaml --hook-type=pre-commit) + +With pre-commit, all code is formatted according to +[black](https://github.com/psf/black) and +[ruff](https://github.com/charliermarsh/ruff) guidelines. 
+ +To check if your changes pass pre-commit without committing, run: + + pre-commit run --all-files --config=.config/pre-commit-config.yaml + +### Testing + +Ensure you have the required packages to run the tests: + + pip install -r requirements-test.txt + +To run the tests and view coverage, execute: + + pytest -c .config/pytest.ini --cov hdx --cov-config .config/coveragerc + +### Packages + +[pip-tools](https://github.com/jazzband/pip-tools) is used for +package management. If you’ve introduced a new package to the +source code please add it to the `dependencies` section of +`pyproject.toml` with any known version constraints. + +For adding packages for testing, add them to +the `test` section under `[project.optional-dependencies]`. + +Any changes to the dependencies will be automatically reflected in +`requirements.txt` and `requirements-test.txt` with `pre-commit`, +but you can re-generate the files without committing by executing: + + pre-commit run pip-compile --all-files --config=.config/pre-commit-config.yaml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b59f7ed --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,71 @@ +######################### +# Project Configuration # +######################### + +# Project name and version needed to run tests + +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "hdx-scraper-wfp-rainfall" +requires-python = ">=3.12" +dependencies = [ + "hdx-python-api", + "hdx-python-utilities", +] + +dynamic = ["version"] + +[project.optional-dependencies] +test = [ + "pytest", + "pytest-cov" +] +dev = ["pre-commit"] + +######### +# Hatch # +######### + +# Build + +[tool.hatch.build.targets.wheel] +packages = ["src/hdx"] + +[tool.hatch.build.hooks.vcs] +version-file = "src/hdx/scraper/wfp_rainfall/_version.py" + +[tool.hatch.metadata] +allow-direct-references = true + +# Versioning + +[tool.hatch.version] +source = "vcs" + 
+[tool.hatch.version.raw-options] +local_scheme = "no-local-version" +version_scheme = "python-simplified-semver" + +# Tests + +[tool.hatch.envs.hatch-test] +features = ["test"] + +[[tool.hatch.envs.hatch-test.matrix]] +python = ["3.12"] + +[tool.hatch.envs.hatch-test.scripts] +run = """ + pytest -c .config/pytest.ini --rootdir=. --junitxml=test-results.xml \ + --cov --cov-config=.config/coveragerc --no-cov-on-fail \ + --cov-report=lcov --cov-report=term-missing + """ + +[tool.hatch.envs.hatch-static-analysis.scripts] +format-check = ["ruff format --config .config/ruff.toml --check --diff {args:.}",] +format-fix = ["ruff format --config .config/ruff.toml {args:.}",] +lint-check = ["ruff check --config .config/ruff.toml {args:.}",] +lint-fix = ["ruff check --config .config/ruff.toml --fix {args:.}",] diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 0000000..07c4f6e --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,374 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile pyproject.toml --resolver=backtracking --extra test -c requirements.txt -o requirements-test.txt +annotated-types==0.7.0 + # via + # -c requirements.txt + # pydantic +attrs==25.1.0 + # via + # -c requirements.txt + # frictionless + # jsonlines + # jsonschema + # referencing +certifi==2025.1.31 + # via + # -c requirements.txt + # requests +chardet==5.2.0 + # via + # -c requirements.txt + # frictionless +charset-normalizer==3.4.1 + # via + # -c requirements.txt + # requests +ckanapi==4.8 + # via + # -c requirements.txt + # hdx-python-api +click==8.1.8 + # via + # -c requirements.txt + # typer +coverage==7.6.12 + # via pytest-cov +defopt==6.4.0 + # via + # -c requirements.txt + # hdx-python-api +dnspython==2.7.0 + # via + # -c requirements.txt + # email-validator +docopt==0.6.2 + # via + # -c requirements.txt + # ckanapi + # num2words +docutils==0.21.2 + # via + # -c requirements.txt + # defopt +email-validator==2.2.0 + # via + # -c 
requirements.txt + # hdx-python-api +et-xmlfile==2.0.0 + # via + # -c requirements.txt + # openpyxl +frictionless==5.18.0 + # via + # -c requirements.txt + # hdx-python-utilities +hdx-python-api==6.3.8 + # via + # -c requirements.txt + # hdx-scraper-wfp-rainfall (pyproject.toml) +hdx-python-country==3.8.8 + # via + # -c requirements.txt + # hdx-python-api +hdx-python-utilities==3.8.4 + # via + # -c requirements.txt + # hdx-scraper-wfp-rainfall (pyproject.toml) + # hdx-python-api + # hdx-python-country +humanize==4.12.1 + # via + # -c requirements.txt + # frictionless +idna==3.10 + # via + # -c requirements.txt + # email-validator + # requests +ijson==3.3.0 + # via + # -c requirements.txt + # hdx-python-utilities +inflect==7.5.0 + # via + # -c requirements.txt + # quantulum3 +iniconfig==2.0.0 + # via pytest +isodate==0.7.2 + # via + # -c requirements.txt + # frictionless +jinja2==3.1.6 + # via + # -c requirements.txt + # frictionless +jsonlines==4.0.0 + # via + # -c requirements.txt + # hdx-python-utilities +jsonpath-ng==1.7.0 + # via + # -c requirements.txt + # libhxl +jsonschema==4.23.0 + # via + # -c requirements.txt + # frictionless + # tableschema-to-template +jsonschema-specifications==2024.10.1 + # via + # -c requirements.txt + # jsonschema +libhxl==5.2.2 + # via + # -c requirements.txt + # hdx-python-api + # hdx-python-country +loguru==0.7.3 + # via + # -c requirements.txt + # hdx-python-utilities +makefun==1.15.6 + # via + # -c requirements.txt + # hdx-python-api +markdown-it-py==3.0.0 + # via + # -c requirements.txt + # rich +marko==2.1.2 + # via + # -c requirements.txt + # frictionless +markupsafe==3.0.2 + # via + # -c requirements.txt + # jinja2 +mdurl==0.1.2 + # via + # -c requirements.txt + # markdown-it-py +more-itertools==10.6.0 + # via + # -c requirements.txt + # inflect +num2words==0.5.14 + # via + # -c requirements.txt + # quantulum3 +openpyxl==3.1.5 + # via + # -c requirements.txt + # hdx-python-utilities +packaging==24.2 + # via pytest 
+petl==1.7.15 + # via + # -c requirements.txt + # frictionless +pluggy==1.5.0 + # via pytest +ply==3.11 + # via + # -c requirements.txt + # jsonpath-ng + # libhxl +pockets==0.9.1 + # via + # -c requirements.txt + # sphinxcontrib-napoleon +pydantic==2.10.6 + # via + # -c requirements.txt + # frictionless +pydantic-core==2.27.2 + # via + # -c requirements.txt + # pydantic +pygments==2.19.1 + # via + # -c requirements.txt + # rich +pyphonetics==0.5.3 + # via + # -c requirements.txt + # hdx-python-utilities +pytest==8.3.5 + # via + # hdx-scraper-wfp-rainfall (pyproject.toml) + # pytest-cov +pytest-cov==6.0.0 + # via hdx-scraper-wfp-rainfall (pyproject.toml) +python-dateutil==2.9.0.post0 + # via + # -c requirements.txt + # frictionless + # hdx-python-utilities + # libhxl +python-io-wrapper==0.3.1 + # via + # -c requirements.txt + # libhxl +python-slugify==8.0.4 + # via + # -c requirements.txt + # ckanapi + # frictionless +pyyaml==6.0.2 + # via + # -c requirements.txt + # frictionless + # tableschema-to-template +quantulum3==0.9.2 + # via + # -c requirements.txt + # hdx-python-api +ratelimit==2.2.1 + # via + # -c requirements.txt + # hdx-python-utilities +referencing==0.36.2 + # via + # -c requirements.txt + # jsonschema + # jsonschema-specifications +requests==2.32.3 + # via + # -c requirements.txt + # ckanapi + # frictionless + # hdx-python-api + # libhxl + # requests-file +requests-file==2.1.0 + # via + # -c requirements.txt + # hdx-python-utilities +rfc3986==2.0.0 + # via + # -c requirements.txt + # frictionless +rich==13.9.4 + # via + # -c requirements.txt + # typer +rpds-py==0.23.1 + # via + # -c requirements.txt + # jsonschema + # referencing +ruamel-yaml==0.18.10 + # via + # -c requirements.txt + # hdx-python-utilities +ruamel-yaml-clib==0.2.12 + # via + # -c requirements.txt + # ruamel-yaml +setuptools==75.8.2 + # via + # -c requirements.txt + # ckanapi +shellingham==1.5.4 + # via + # -c requirements.txt + # typer +simpleeval==1.0.3 + # via + # -c 
requirements.txt + # frictionless +simplejson==3.20.1 + # via + # -c requirements.txt + # ckanapi +six==1.17.0 + # via + # -c requirements.txt + # ckanapi + # pockets + # python-dateutil + # sphinxcontrib-napoleon +sphinxcontrib-napoleon==0.7 + # via + # -c requirements.txt + # defopt +stringcase==1.2.0 + # via + # -c requirements.txt + # frictionless +structlog==25.1.0 + # via + # -c requirements.txt + # libhxl +tableschema-to-template==0.0.13 + # via + # -c requirements.txt + # hdx-python-utilities +tabulate==0.9.0 + # via + # -c requirements.txt + # frictionless +tenacity==9.0.0 + # via + # -c requirements.txt + # hdx-python-country +text-unidecode==1.3 + # via + # -c requirements.txt + # python-slugify +typeguard==4.4.2 + # via + # -c requirements.txt + # inflect +typer==0.15.2 + # via + # -c requirements.txt + # frictionless +typing-extensions==4.12.2 + # via + # -c requirements.txt + # frictionless + # pydantic + # pydantic-core + # referencing + # typeguard + # typer +unidecode==1.3.8 + # via + # -c requirements.txt + # libhxl + # pyphonetics +urllib3==2.3.0 + # via + # -c requirements.txt + # libhxl + # requests +validators==0.34.0 + # via + # -c requirements.txt + # frictionless +wheel==0.45.1 + # via + # -c requirements.txt + # libhxl +xlrd==2.0.1 + # via + # -c requirements.txt + # hdx-python-utilities +xlrd3==1.1.0 + # via + # -c requirements.txt + # libhxl +xlsx2csv==0.8.4 + # via + # -c requirements.txt + # hdx-python-utilities +xlsxwriter==3.2.2 + # via + # -c requirements.txt + # tableschema-to-template +xlwt==1.3.0 + # via + # -c requirements.txt + # hdx-python-utilities diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2401a35 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,213 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile pyproject.toml --resolver=backtracking -o requirements.txt +annotated-types==0.7.0 + # via pydantic +attrs==25.1.0 + # via + # frictionless + # jsonlines 
+ # jsonschema + # referencing +certifi==2025.1.31 + # via requests +chardet==5.2.0 + # via frictionless +charset-normalizer==3.4.1 + # via requests +ckanapi==4.8 + # via hdx-python-api +click==8.1.8 + # via typer +defopt==6.4.0 + # via hdx-python-api +dnspython==2.7.0 + # via email-validator +docopt==0.6.2 + # via + # ckanapi + # num2words +docutils==0.21.2 + # via defopt +email-validator==2.2.0 + # via hdx-python-api +et-xmlfile==2.0.0 + # via openpyxl +frictionless==5.18.0 + # via hdx-python-utilities +hdx-python-api==6.3.8 + # via hdx-scraper-wfp-rainfall (pyproject.toml) +hdx-python-country==3.8.8 + # via hdx-python-api +hdx-python-utilities==3.8.4 + # via + # hdx-scraper-wfp-rainfall (pyproject.toml) + # hdx-python-api + # hdx-python-country +humanize==4.12.1 + # via frictionless +idna==3.10 + # via + # email-validator + # requests +ijson==3.3.0 + # via hdx-python-utilities +inflect==7.5.0 + # via quantulum3 +isodate==0.7.2 + # via frictionless +jinja2==3.1.6 + # via frictionless +jsonlines==4.0.0 + # via hdx-python-utilities +jsonpath-ng==1.7.0 + # via libhxl +jsonschema==4.23.0 + # via + # frictionless + # tableschema-to-template +jsonschema-specifications==2024.10.1 + # via jsonschema +libhxl==5.2.2 + # via + # hdx-python-api + # hdx-python-country +loguru==0.7.3 + # via hdx-python-utilities +makefun==1.15.6 + # via hdx-python-api +markdown-it-py==3.0.0 + # via rich +marko==2.1.2 + # via frictionless +markupsafe==3.0.2 + # via jinja2 +mdurl==0.1.2 + # via markdown-it-py +more-itertools==10.6.0 + # via inflect +num2words==0.5.14 + # via quantulum3 +openpyxl==3.1.5 + # via hdx-python-utilities +petl==1.7.15 + # via frictionless +ply==3.11 + # via + # jsonpath-ng + # libhxl +pockets==0.9.1 + # via sphinxcontrib-napoleon +pydantic==2.10.6 + # via frictionless +pydantic-core==2.27.2 + # via pydantic +pygments==2.19.1 + # via rich +pyphonetics==0.5.3 + # via hdx-python-utilities +python-dateutil==2.9.0.post0 + # via + # frictionless + # hdx-python-utilities + # 
libhxl +python-io-wrapper==0.3.1 + # via libhxl +python-slugify==8.0.4 + # via + # ckanapi + # frictionless +pyyaml==6.0.2 + # via + # frictionless + # tableschema-to-template +quantulum3==0.9.2 + # via hdx-python-api +ratelimit==2.2.1 + # via hdx-python-utilities +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications +requests==2.32.3 + # via + # ckanapi + # frictionless + # hdx-python-api + # libhxl + # requests-file +requests-file==2.1.0 + # via hdx-python-utilities +rfc3986==2.0.0 + # via frictionless +rich==13.9.4 + # via typer +rpds-py==0.23.1 + # via + # jsonschema + # referencing +ruamel-yaml==0.18.10 + # via hdx-python-utilities +ruamel-yaml-clib==0.2.12 + # via ruamel-yaml +setuptools==75.8.2 + # via ckanapi +shellingham==1.5.4 + # via typer +simpleeval==1.0.3 + # via frictionless +simplejson==3.20.1 + # via ckanapi +six==1.17.0 + # via + # ckanapi + # pockets + # python-dateutil + # sphinxcontrib-napoleon +sphinxcontrib-napoleon==0.7 + # via defopt +stringcase==1.2.0 + # via frictionless +structlog==25.1.0 + # via libhxl +tableschema-to-template==0.0.13 + # via hdx-python-utilities +tabulate==0.9.0 + # via frictionless +tenacity==9.0.0 + # via hdx-python-country +text-unidecode==1.3 + # via python-slugify +typeguard==4.4.2 + # via inflect +typer==0.15.2 + # via frictionless +typing-extensions==4.12.2 + # via + # frictionless + # pydantic + # pydantic-core + # referencing + # typeguard + # typer +unidecode==1.3.8 + # via + # libhxl + # pyphonetics +urllib3==2.3.0 + # via + # libhxl + # requests +validators==0.34.0 + # via frictionless +wheel==0.45.1 + # via libhxl +xlrd==2.0.1 + # via hdx-python-utilities +xlrd3==1.1.0 + # via libhxl +xlsx2csv==0.8.4 + # via hdx-python-utilities +xlsxwriter==3.2.2 + # via tableschema-to-template +xlwt==1.3.0 + # via hdx-python-utilities diff --git a/src/hdx/scraper/wfp_rainfall/__init__.py b/src/hdx/scraper/wfp_rainfall/__init__.py new file mode 100644 index 0000000..d986e5b --- /dev/null +++ 
b/src/hdx/scraper/wfp_rainfall/__init__.py @@ -0,0 +1 @@ +from ._version import version as __version__ # noqa: F401 diff --git a/src/hdx/scraper/wfp_rainfall/__main__.py b/src/hdx/scraper/wfp_rainfall/__main__.py new file mode 100755 index 0000000..d5cc063 --- /dev/null +++ b/src/hdx/scraper/wfp_rainfall/__main__.py @@ -0,0 +1,85 @@ +#!/usr/bin/python +""" +Top level script. Calls other functions that generate datasets that this +script then creates in HDX. + +""" + +import logging +from os.path import dirname, expanduser, join + +from hdx.api.configuration import Configuration +from hdx.api.utilities.hdx_error_handler import HDXErrorHandler +from hdx.data.user import User +from hdx.facades.infer_arguments import facade +from hdx.utilities.downloader import Download +from hdx.utilities.path import temp_dir +from hdx.utilities.retriever import Retrieve + +from hdx.scraper.wfp_rainfall.wfp_rainfall import WFPRainfall + +logger = logging.getLogger(__name__) + +_USER_AGENT_LOOKUP = "hdx-scraper-wfp-rainfall" +_SAVED_DATA_DIR = "saved_data" # Keep in repo to avoid deletion in /tmp +_UPDATED_BY_SCRIPT = "HDX Scraper: WFP Rainfall" + + +def main( + save: bool = True, + use_saved: bool = False, + err_to_hdx: bool = False, +) -> None: + """Generate datasets and create them in HDX + + Args: + save (bool): Save downloaded data. Defaults to True. + use_saved (bool): Use saved data. Defaults to False. + err_to_hdx (bool): Whether to write any errors to HDX metadata. Defaults to False. 
+ + Returns: + None + """ + logger.info(f"##### {_USER_AGENT_LOOKUP} ####") + configuration = Configuration.read() + if not User.check_current_user_organization_access("hdx-hapi", "create_dataset"): + raise PermissionError("API Token does not give access to HDX-HAPI organisation!") + + with HDXErrorHandler(write_to_hdx=err_to_hdx) as error_handler: + with temp_dir(folder=_USER_AGENT_LOOKUP) as temp_folder: + with Download() as downloader: + retriever = Retrieve( + downloader=downloader, + fallback_dir=temp_folder, + saved_dir=_SAVED_DATA_DIR, + temp_dir=temp_folder, + save=save, + use_saved=use_saved, + ) + + wfp_rainfall = WFPRainfall( + configuration, retriever, temp_folder, error_handler + ) + wfp_rainfall.download_data() + dataset = wfp_rainfall.generate_dataset() + dataset.update_from_yaml( + path=join(dirname(__file__), "config", "hdx_dataset_static.yaml") + ) + dataset.create_in_hdx( + remove_additional_resources=True, + match_resource_order=False, + hxl_update=False, + updated_by_script=_UPDATED_BY_SCRIPT, + ) + + +if __name__ == "__main__": + facade( + main, + hdx_site="dev", + user_agent_config_yaml=join(expanduser("~"), ".useragents.yaml"), + user_agent_lookup=_USER_AGENT_LOOKUP, + project_config_yaml=join( + dirname(__file__), "config", "project_configuration.yaml" + ), + ) diff --git a/src/hdx/scraper/wfp_rainfall/config/hdx_dataset_static.yaml b/src/hdx/scraper/wfp_rainfall/config/hdx_dataset_static.yaml new file mode 100755 index 0000000..a589718 --- /dev/null +++ b/src/hdx/scraper/wfp_rainfall/config/hdx_dataset_static.yaml @@ -0,0 +1,29 @@ +license_id: cc-by +methodology: Registry +caveats: This dataset is refreshed daily, but the source datasets have different update schedules. Please refer to the [source datasets](https://data.humdata.org/dataset/?dataseries_name=COD+-+Subnational+Population+Statistics) to verify their specific update frequency. 
+dataset_source: Climate Hazards Center UC Santa Barbara & WFP +package_creator: HDX Data Systems Team +private: False +maintainer: aa13de36-28c5-47a7-8d0b-6d7c754ba8c8 +owner_org: hdx-hapi +data_update_frequency: 1 +notes: | + This dataset contains data obtained from the + [HDX Humanitarian API](https://hapi.humdata.org/) (HDX HAPI), + which provides standardized humanitarian indicators designed + for seamless interoperability from multiple sources. + The data facilitates automated workflows and visualizations + to support humanitarian decision making. + For more information, please see the HDX HAPI + [landing page](https://data.humdata.org/hapi) + and + [documentation](https://hdx-hapi.readthedocs.io/en/latest/). + + Warnings typically indicate corrections have been made to + the data or show things to look out for. Rows with only warnings + are considered complete, and are made available via the API. + Errors usually mean that the data is incomplete or unusable. + Rows with any errors are not present in the API but are included + here for transparency. 
+subnational: "1" +dataset_preview: no_preview diff --git a/src/hdx/scraper/wfp_rainfall/config/project_configuration.yaml b/src/hdx/scraper/wfp_rainfall/config/project_configuration.yaml new file mode 100755 index 0000000..3131017 --- /dev/null +++ b/src/hdx/scraper/wfp_rainfall/config/project_configuration.yaml @@ -0,0 +1,32 @@ +# Collector specific configuration + +resource_info: + name: "Global Climate: Rainfall" + description: "Rainfall data from HDX HAPI, please see [the documentation](https://hdx-hapi.readthedocs.io/en/latest/data_usage_guides/climate/#rainfall) for more information" + +hxl_tags: + location_code: "#country+code" + has_hrp: "#meta+has_hrp" + in_gho: "#meta+in_gho" + provider_admin1_name: "#adm1+name+provider" + provider_admin2_name: "#adm2+name+provider" + admin1_code: "#adm1+code" + admin1_name: "#adm1+name" + admin2_code: "#adm2+code" + admin2_name: "#adm2+name" + admin_level: "#adm+level" + wfp_id: "#adm+code+wfp" + time_period: "#time+period" + rainfall: "#rainfall" + rainfall_long_term_average: "#rainfall+lta" + rainfall_anomaly_pct: "#rainfall+anomaly+pct" + reference_period_start: "#date+start" + reference_period_end: "#date+end" + dataset_hdx_id: "#meta+dataset_id" + resource_hdx_id: "#meta+resource_id" + warning: "#meta+warning" + error: "#meta+error" + +tags: + - "climate-weather" + - "hxl" diff --git a/src/hdx/scraper/wfp_rainfall/wfp_rainfall.py b/src/hdx/scraper/wfp_rainfall/wfp_rainfall.py new file mode 100755 index 0000000..b3f9fcc --- /dev/null +++ b/src/hdx/scraper/wfp_rainfall/wfp_rainfall.py @@ -0,0 +1,69 @@ +#!/usr/bin/python +"""wfp-rainfall scraper""" + +import logging + +from hdx.api.configuration import Configuration +from hdx.api.utilities.hdx_error_handler import HDXErrorHandler +from hdx.data.dataset import Dataset +from hdx.location.adminlevel import AdminLevel +from hdx.scraper.framework.utilities.hapi_admins import complete_admins +from hdx.utilities.dateparse import iso_string_from_datetime, parse_date_range 
+from hdx.utilities.retriever import Retrieve
+
+logger = logging.getLogger(__name__)
+
+
+class WFPRainfall:
+    """Assemble the HDX HAPI rainfall dataset.
+
+    ``download_data`` loads the admin level 1 and 2 p-code helpers and
+    ``generate_dataset`` builds the HDX ``Dataset`` with a single CSV resource
+    from ``self.data`` and ``self.dates``.
+    """
+
+    def __init__(
+        self,
+        configuration: Configuration,
+        retriever: Retrieve,
+        temp_dir: str,
+        error_handler: HDXErrorHandler,
+    ):
+        """Store the collaborators and start with empty result containers.
+
+        Args:
+            configuration: Project configuration; ``generate_dataset`` reads its
+                ``tags``, ``hxl_tags`` and ``resource_info`` entries.
+            retriever: Passed to ``AdminLevel`` when loading p-codes.
+            temp_dir: Passed to ``generate_resource_from_iterable`` together with
+                the CSV file name.
+            error_handler: Stored on the instance; not used by the methods here.
+        """
+        self._configuration = configuration
+        self._retriever = retriever
+        self._temp_dir = temp_dir
+        self._error_handler = error_handler
+        # One AdminLevel object per admin level, appended by get_pcodes().
+        self._admins = []
+        # Rows handed to the resource generator in generate_dataset().
+        # NOTE(review): starts as a dict and nothing in this class fills it yet -
+        # confirm the intended row container.
+        self.data = {}
+        # Dates whose min/max become the dataset time period.
+        self.dates = []
+
+    def get_pcodes(self) -> None:
+        """Load an ``AdminLevel`` helper for admin levels 1 and 2.
+
+        Each helper is set up from its libhxl dataset (fetched through the
+        retriever), has its p-code formats loaded and is appended to
+        ``self._admins`` in level order.
+        """
+        for admin_level in [1, 2]:
+            admin = AdminLevel(admin_level=admin_level, retriever=self._retriever)
+            dataset = admin.get_libhxl_dataset(retriever=self._retriever)
+            admin.setup_from_libhxl_dataset(dataset)
+            admin.load_pcode_formats()
+            self._admins.append(admin)
+
+    def download_data(self) -> None:
+        """Prepare the data needed by ``generate_dataset``.
+
+        Currently this only loads the p-code helpers.
+        """
+        # NOTE(review): nothing here populates self.data or self.dates, so
+        # generate_dataset() has no rows or dates to work with yet.
+        self.get_pcodes()
+
+    def generate_dataset(self) -> Dataset:
+        """Create the ``hdx-hapi-rainfall`` dataset with its CSV resource.
+
+        Returns:
+            The ``Dataset`` with tags from the configuration, the "world"
+            location, a time period spanning ``self.dates`` and the resource
+            generated from ``self.data``.
+
+        Raises:
+            ValueError: From ``min``/``max`` when ``self.dates`` is empty.
+        """
+        dataset = Dataset(
+            {
+                "name": "hdx-hapi-rainfall",
+                "title": "HDX HAPI - Climate: Rainfall",
+            }
+        )
+        dataset.add_tags(self._configuration["tags"])
+        dataset.add_other_location("world")
+        # The time period covers the earliest to the latest collected date.
+        start_date = min(self.dates)
+        end_date = max(self.dates)
+        dataset.set_time_period(start_date, end_date)
+
+        # The hxl_tags mapping supplies both the column headers (its keys, in
+        # configuration order) and the HXL tag for each column.
+        hxl_tags = self._configuration["hxl_tags"]
+        headers = list(hxl_tags.keys())
+        dataset.generate_resource_from_iterable(
+            headers,
+            self.data,
+            hxl_tags,
+            self._temp_dir,
+            "hdx_hapi_rainfall_global.csv",
+            self._configuration["resource_info"],
+            encoding="utf-8-sig",
+        )
+
+        return dataset
diff --git a/tests/test_wfp_rainfall.py b/tests/test_wfp_rainfall.py
new file mode 100644
index 0000000..e004883
--- /dev/null
+++ b/tests/test_wfp_rainfall.py
@@ -0,0 +1,50 @@
+from os.path import join
+
+import pytest
+from hdx.api.configuration import Configuration
+from hdx.utilities.downloader import Download
+from hdx.utilities.path import temp_dir
+from hdx.utilities.retriever import Retrieve
+from hdx.utilities.useragent import UserAgent
+
+
+class TestWFPRainfall:
+    """Test scaffolding for the WFP rainfall scraper."""
+
+    @pytest.fixture(scope="function")
+    def configuration(self, config_dir):
+        """Create a read-only HDX configuration from the project config YAML."""
+        UserAgent.set_global("test")
+        Configuration._create(
+            hdx_read_only=True,
+            hdx_site="prod",
+            project_config_yaml=join(config_dir, "project_configuration.yaml"),
+        )
+        return Configuration.read()
+
+    @pytest.fixture(scope="class")
+    def fixtures_dir(self):
+        """Return the relative path of the test fixtures folder."""
+        return join("tests", "fixtures")
+
+    @pytest.fixture(scope="class")
+    def input_dir(self, fixtures_dir):
+        """Return the ``input`` folder inside the fixtures folder."""
+        return join(fixtures_dir, "input")
+
+    @pytest.fixture(scope="class")
+    def config_dir(self, fixtures_dir):
+        """Return the scraper package's config folder under ``src``."""
+        # NOTE(review): the fixtures_dir argument is requested but not used.
+        return join("src", "hdx", "scraper", "wfp_rainfall", "config")
+
+    def test_wfp_rainfall(self, configuration, fixtures_dir, input_dir, config_dir):
+        """Set up a temp folder and a saved-data retriever for the scraper."""
+        with temp_dir(
+            "Test_wfp_rainfall",
+            delete_on_success=True,
+            delete_on_failure=False,
+        ) as tempdir:
+            with Download(user_agent="test") as downloader:
+                # Reads previously saved inputs from input_dir (use_saved=True)
+                # and does not save new ones (save=False).
+                retriever = Retrieve(
+                    downloader=downloader,
+                    fallback_dir=tempdir,
+                    saved_dir=input_dir,
+                    temp_dir=tempdir,
+                    save=False,
+                    use_saved=True,
+                )
+
+                # NOTE(review): `dataset` is never assigned in this test and
+                # `WFPRainfall` is neither imported nor instantiated, so this line
+                # raises NameError as written - presumably the scraper run
+                # (download_data / generate_dataset) is still to be added here.
+                # `retriever` is likewise created but not used yet.
+                dataset.update_from_yaml(path=join(config_dir, "hdx_dataset_static.yaml"))