Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: duckdb inclusion, again #648

Merged
merged 7 commits into from
Feb 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 18 additions & 15 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ concurrency:
env:
CARGO_TERM_COLOR: always
CARGO_TERM_VERBOSE: true
duckdb-version: "1.2.0"

jobs:
test-core:
Expand All @@ -31,7 +32,7 @@ jobs:
- name: Test
run: cargo test -p stac --all-features
check-features-core:
name: Check stac features
name: Check all features
runs-on: ubuntu-latest
defaults:
run:
Expand All @@ -46,26 +47,33 @@ jobs:
test:
name: Test
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: Swatinem/rust-cache@v2
- name: Format
run: cargo fmt --check
- name: Clippy
run: cargo clippy --workspace -- -D warnings
- name: Test
run: cargo test --all-features
test-with-duckdb:
name: Test w/ DuckDB
runs-on: ubuntu-latest
env:
DUCKDB_LIB_DIR: /opt/duckdb
DUCKDB_INCLUDE_DIR: /opt/duckdb
LD_LIBRARY_PATH: /opt/duckdb
steps:
- uses: actions/checkout@v4
- uses: Swatinem/rust-cache@v2
- uses: astral-sh/setup-uv@v5
- name: Get DuckDB
run: |
wget https://github.com/duckdb/duckdb/releases/download/v1.2.0/libduckdb-linux-amd64.zip
wget https://github.com/duckdb/duckdb/releases/download/v${{ env.duckdb-version }}/libduckdb-linux-amd64.zip
unzip libduckdb-linux-amd64.zip -d /opt/duckdb
- name: Format
run: cargo fmt --check
- name: Clippy
run: cargo clippy --workspace --no-default-features -- -D warnings
- name: Build # need to build first to get the executable for CLI tests
run: cargo build --no-default-features
run: cargo build -p stac-duckdb -p stac-cli -F duckdb
- name: Test
run: cargo test --no-default-features
run: cargo test -p stac-duckdb -p stac-cli -F duckdb
- name: Validate stac-server
run: uv run --group stac-api-validator scripts/validate-stac-server
- name: Validate stac-geoparquet
Expand All @@ -80,7 +88,6 @@ jobs:
- v0.9.1
env:
DUCKDB_LIB_DIR: /opt/duckdb
DUCKDB_INCLUDE_DIR: /opt/duckdb
LD_LIBRARY_PATH: /opt/duckdb
services:
pgstac:
Expand All @@ -98,10 +105,6 @@ jobs:
- uses: actions/checkout@v4
- uses: Swatinem/rust-cache@v2
- uses: astral-sh/setup-uv@v5
- name: Get DuckDB
run: |
wget https://github.com/duckdb/duckdb/releases/download/v1.1.3/libduckdb-linux-amd64.zip
unzip libduckdb-linux-amd64.zip -d /opt/duckdb
- name: Test
run: cargo test -p pgstac --all-features
- name: Validate
Expand All @@ -114,7 +117,7 @@ jobs:
- uses: dtolnay/rust-toolchain@nightly
- uses: Swatinem/rust-cache@v2
- name: Check
run: cargo check --workspace --all-features
run: cargo check --workspace
msrv:
runs-on: ubuntu-latest
steps:
Expand Down
2 changes: 0 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,8 @@ members = [
]
default-members = [
"crates/api",
"crates/cli",
"crates/core",
"crates/derive",
"crates/duckdb",
"crates/extensions",
"crates/server",
]
Expand Down
9 changes: 4 additions & 5 deletions crates/cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,15 @@ categories.workspace = true
rust-version.workspace = true

[features]
default = ["duckdb-bundled", "pgstac"]
default = []
pgstac = ["stac-server/pgstac"]
duckdb-bundled = ["duckdb/bundled"]
duckdb = ["dep:stac-duckdb"]
duckdb-bundled = ["duckdb", "stac-duckdb/bundled"]

[dependencies]
anyhow.workspace = true
axum.workspace = true
clap = { workspace = true, features = ["derive"] }
duckdb = { workspace = true }
libduckdb-sys = { workspace = true }
serde_json.workspace = true
stac = { workspace = true, features = [
"geoparquet-compression",
Expand All @@ -30,7 +29,7 @@ stac = { workspace = true, features = [
"validate",
] }
stac-api = { workspace = true, features = ["client"] }
stac-duckdb.workspace = true
stac-duckdb = { workspace = true, optional = true }
stac-server = { workspace = true, features = ["axum"] }
tokio = { workspace = true, features = [
"macros",
Expand Down
24 changes: 10 additions & 14 deletions crates/cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ Command Line Interface (CLI) for [STAC](https://stacspec.org/), named `stacrs`.
## Installation

```sh
cargo install stac-cli
cargo install stac-cli -F duckdb # to use libduckdb on your system
# or
cargo install stac-cli -F duckdb-bundled # to build libduckdb on install (slow)
```

Then:
Expand Down Expand Up @@ -46,7 +48,7 @@ $ stacrs validate item.json

**stacrs** provides the following subcommands:

- `stacrs search`: searches STAC APIs and geoparquet files
- `stacrs search`: searches STAC APIs and, if the `duckdb` feature is enabled, geoparquet files
- `stacrs serve`: serves a STAC API
- `stacrs translate`: converts STAC from one format to another
- `stacrs validate`: validates a STAC value
Expand All @@ -55,20 +57,14 @@ Use the `--help` flag to see all available options for the CLI and the subcomman

## Features

This crate has two features:
This crate has three features:

- `pgstac`: enable a [pgstac](https://github.com/stac-utils/pgstac) backend for `stacrs serve` (enabled by default)
- `duckdb-bundled`: bundle DuckDB by building it from source, instead of using a local installation (enabled by default)
- `pgstac`: enable a [pgstac](https://github.com/stac-utils/pgstac) backend for `stacrs serve`
- `duckdb`: build with DuckDB support, which enables searching [stac-geoparquet](https://github.com/stac-utils/stac-geoparquet) (requires DuckDB to be present on your system)
- `duckdb-bundled`: bundle DuckDB by building it from source, instead of using a local installation (does _not_ require DuckDB to be present on your system)

### DuckDB

If you have DuckDB installed locally and want to use that shared library (instead of building it when you install), disable the `duckdb-bundled` feature:

```shell
cargo install stac-cli --no-default-features -F pgstac
```

This can dramatically speed up install time.
> [!TIP]
> If you're using the `duckdb` feature, set `DUCKDB_LIB_DIR` to the directory containing your **libduckdb**. If you're on macOS and using [Homebrew](https://brew.sh/), this might look like `export DUCKDB_LIB_DIR=/opt/homebrew/lib`.

## Other info

Expand Down
11 changes: 8 additions & 3 deletions crates/cli/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@

use anyhow::{anyhow, Error, Result};
use clap::{Parser, Subcommand};
use duckdb as _;
use libduckdb_sys as _;
use stac::{geoparquet::Compression, Collection, Format, Item, Links, Migrate, Validate};
use stac_api::{GetItems, GetSearch, Search};
use stac_server::Backend;
Expand Down Expand Up @@ -309,7 +307,14 @@ impl Stacrs {
};
let search: Search = get_search.try_into()?;
let item_collection = if use_duckdb {
stac_duckdb::search(href, search, *max_items)?
#[cfg(feature = "duckdb")]
{
stac_duckdb::search(href, search, *max_items)?
}
#[cfg(not(feature = "duckdb"))]
return Err(anyhow!(
"the `duckdb` feature is not enabled, cannot search stac-geoparquet"
));
} else {
stac_api::client::search(href, search, *max_items).await?
};
Expand Down
4 changes: 4 additions & 0 deletions crates/duckdb/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ license.workspace = true
categories.workspace = true
rust-version.workspace = true

[features]
default = []
bundled = ["duckdb/bundled"]

[dependencies]
arrow.workspace = true
chrono.workspace = true
Expand Down
2 changes: 1 addition & 1 deletion crates/duckdb/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ If you want to build the DuckDB library as a part of this (or a downstream's) cr
E.g. to test this crate if you don't have DuckDB locally:

```shell
cargo test -p stac-duckdb -F duckdb/bundled
cargo test -p stac-duckdb -F bundled
```

See [the duckdb-rs docs](https://github.com/duckdb/duckdb-rs?tab=readme-ov-file#notes-on-building-duckdb-and-libduckdb-sys) for more.
Expand Down
14 changes: 8 additions & 6 deletions docs/cli/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,22 @@ description: The stac-rs command-line interface (CLI), stacrs
The **stac-rs** command-line interface can be installed two ways.
If you have Rust, use `cargo`:

```shell
cargo install stac-cli
```sh
cargo install stac-cli -F duckdb # to use libduckdb on your system
# or
cargo install stac-cli -F duckdb-bundled # to build libduckdb on install (slow)
```

If you have Python, use `pip`:
The CLI is called **stacrs**:

```shell
python -m pip install stacrs
stacrs --help
```

The CLI is called **stacrs**:
If you don't have DuckDB on your system, you can also use the Python wheel, which includes **libduckdb**:

```shell
stacrs --help
python -m pip install stacrs
```

For examples of using the CLI, check out the slides from [@gadomski's](https://github.com/gadomski/) 2024 FOSS4G-NA presentation [here](https://www.gadom.ski/2024-09-FOSS4G-NA-stac-rs/).
2 changes: 1 addition & 1 deletion scripts/format
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
set -e

cargo fmt
cargo clippy --workspace --all-features --fix
cargo clippy --workspace --fix
2 changes: 1 addition & 1 deletion scripts/lint
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
set -e

cargo fmt --check
cargo clippy --workspace --all-features -- -D warnings
cargo clippy --workspace -- -D warnings
52 changes: 42 additions & 10 deletions scripts/validate-stac-geoparquet
Original file line number Diff line number Diff line change
@@ -1,29 +1,35 @@
#!/usr/bin/env python

import json
import sys
import shutil
import subprocess
import sys
import tempfile
from typing import Any
from deepdiff import DeepDiff
from pathlib import Path
from typing import Any

import pyarrow
import pyarrow.parquet
import stac_geoparquet.arrow
import pyarrow
from deepdiff import DeepDiff

root = Path(__file__).parents[1]
path = root / "spec-examples" / "v1.1.0" / "extended-item.json"
directory = tempfile.mkdtemp()
parquet_path = Path(directory) / "extended-item.parquet"


def clean_item(item: dict[str, Any]) -> None:
if "type" not in item:
item["type"] = "Feature"
if item["geometry"]["type"] == "MultiPolygon" and len(item["geometry"]["coordinates"]) == 1:
if (
item["geometry"]["type"] == "MultiPolygon"
and len(item["geometry"]["coordinates"]) == 1
):
item["geometry"]["type"] = "Polygon"
item["geometry"]["coordinates"] = item["geometry"]["coordinates"][0]


def clean_report(report: dict[str, Any]) -> dict[str, Any]:
"""We expect datetime values to be changed in the report."""
if report.get("values_changed"):
Expand All @@ -36,10 +42,22 @@ def clean_report(report: dict[str, Any]) -> dict[str, Any]:
del report["values_changed"]
return report


try:
# Writing
subprocess.check_call(
["cargo", "run", "--no-default-features", "-F", "geoparquet", "--", "translate", path, parquet_path]
[
"cargo",
"run",
"-p",
"stac-cli",
"-F",
"duckdb",
"--",
"translate",
path,
parquet_path,
]
)
table = pyarrow.parquet.read_table(parquet_path)
after = next(stac_geoparquet.arrow.stac_table_to_items(table))
Expand All @@ -57,11 +75,25 @@ try:
# Reading
table = stac_geoparquet.arrow.parse_stac_items_to_arrow([before])
stac_geoparquet.arrow.to_parquet(table, parquet_path)
item_collection = json.loads(subprocess.check_output(
["cargo", "run", "--no-default-features", "-F", "geoparquet", "--", "translate", parquet_path]
))
item_collection = json.loads(
subprocess.check_output(
[
"cargo",
"run",
"-p",
"stac-cli",
"-F",
"duckdb",
"--",
"translate",
parquet_path,
]
)
)
assert len(item_collection["features"]) == 1
clean_item(item_collection["features"][0]) # stac-geoparquet writes as a multi-polygon
clean_item(
item_collection["features"][0]
) # stac-geoparquet writes as a multi-polygon
report = DeepDiff(before, item_collection["features"][0]).to_dict()
report = clean_report(report)
if report:
Expand Down