From 31e7741d73f027b86306320681f5e3d7c374b9a1 Mon Sep 17 00:00:00 2001 From: universalmind303 Date: Sat, 16 Mar 2024 16:30:13 -0500 Subject: [PATCH] feat: add sqlContext (#185) * feat: add sqlContext * pr feedback --- Cargo.toml | 133 ++++++++++--------- __tests__/sql.test.ts | 81 +++++++++++ polars/index.ts | 16 +++ polars/sql.ts | 302 ++++++++++++++++++++++++++++++++++++++++++ src/lazy/dataframe.rs | 2 +- src/lib.rs | 1 + src/sql.rs | 44 ++++++ 7 files changed, 512 insertions(+), 67 deletions(-) create mode 100644 __tests__/sql.test.ts create mode 100644 polars/sql.ts create mode 100644 src/sql.rs diff --git a/Cargo.toml b/Cargo.toml index 4a71fb7ef..16c078c69 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,11 +15,11 @@ crate-type = ["cdylib", "lib"] [dependencies] ahash = "0.8.7" bincode = "1.3.3" -napi = { version = "2.14.2", default-features = false, features = [ - "napi8", - "serde-json", +napi = { version = "2.16.0", default-features = false, features = [ + "napi8", + "serde-json", ] } -napi-derive = { version = "2.14.6", default-features = false } +napi-derive = { version = "2.16.0", default-features = false } polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "3cf4897e679b056d17a235d48867035265d43cdc", default-features = false } polars-io = { git = "https://github.com/pola-rs/polars.git", rev = "3cf4897e679b056d17a235d48867035265d43cdc", default-features = false } polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "3cf4897e679b056d17a235d48867035265d43cdc", default-features = false } @@ -30,68 +30,69 @@ either = "1.9" [dependencies.polars] features = [ - "binary_encoding", - "rolling_window", - "json", - "dynamic_group_by", - "zip_with", - "simd", - "lazy", - "strings", - "temporal", - "random", - "object", - "fmt", - "performant", - "dtype-full", - "rows", - "round_series", - "is_unique", - "is_in", - "is_first_distinct", - "asof_join", - "cross_join", - "dot_product", - "concat_str", - "row_hash", - "reinterpret", - "mode", - "extract_jsonpath", - "cum_agg", - "rolling_window", - "repeat_by", - "interpolate", - "ewma", - "rank", - "propagate_nans", - "diff", - "pct_change", - "moment", - "diagonal_concat", - "abs", - "dot_diagram", - "dataframe_arithmetic", - "json", - "string_encoding", - "product", - "ndarray", - "unique_counts", - "log", - "serde-lazy", - "partition_by", - "pivot", - "semi_anti_join", - "parquet", - "to_dummies", - "ipc", - "avro", - "list_eval", - "arg_where", - "timezones", - "peaks", - "string_pad", - "cov", - "group_by_list", + "binary_encoding", + "rolling_window", + "json", + "dynamic_group_by", + "zip_with", + "simd", + "lazy", + "strings", + "temporal", + "random", + "object", + "fmt", + "performant", + "dtype-full", + "rows", + "round_series", + "is_unique", + "is_in", + "is_first_distinct", + "asof_join", + "cross_join", + "dot_product", + "concat_str", + "row_hash", + "reinterpret", + "mode", + "extract_jsonpath", + "cum_agg", + "rolling_window", + "repeat_by", + "interpolate", + "ewma", + "rank", + "propagate_nans", + "diff", + "pct_change", + "moment", + "diagonal_concat", + "abs", + "dot_diagram", + "dataframe_arithmetic", + "json", + "string_encoding", + "product", + "ndarray", + "unique_counts", + "log", + "serde-lazy", + "partition_by", + "pivot", + "semi_anti_join", + "parquet", + "to_dummies", + "ipc", + "avro", + "list_eval", + "arg_where", + "timezones", + "peaks", + "string_pad", + "cov", + "group_by_list", + "sql", ] git = "https://github.com/pola-rs/polars.git" rev = "3cf4897e679b056d17a235d48867035265d43cdc" diff --git a/__tests__/sql.test.ts b/__tests__/sql.test.ts new file mode 100644 index 000000000..a8e9e8822 --- /dev/null +++ b/__tests__/sql.test.ts @@ -0,0 +1,81 @@ +import pl from "@polars"; +describe("sql", () => { + test("execute", () => { + const df = pl.DataFrame({ + values: [ + ["aa", "bb"], + [null, "cc"], + ["dd", null], + ], + }); + + const ctx = pl.SQLContext({ df }); + const actual = ctx.execute("SELECT * FROM df").collectSync(); + + expect(actual).toFrameEqual(df); + const actual2 = ctx.execute("SELECT * FROM df", { eager: true }); + expect(actual2).toFrameEqual(df); + }); + + test("register and query dataframe", () => { + const df = pl.DataFrame({ hello: ["world"] }); + const ctx = pl.SQLContext(); + ctx.register("frame_data", df); + const actual = ctx.execute("SELECT * FROM frame_data", { eager: true }); + + const expected = pl.DataFrame({ hello: ["world"] }); + + expect(actual).toFrameEqual(expected); + ctx.register("null_frame", null); + + const actual2 = ctx.execute("SELECT * FROM null_frame", { eager: true }); + const expected2 = pl.DataFrame(); + expect(actual2).toFrameEqual(expected2); + }); + test("register many", () => { + const lf1 = pl.DataFrame({ a: [1, 2, 3], b: ["m", "n", "o"] }); + const lf2 = pl.DataFrame({ a: [2, 3, 4], c: ["p", "q", "r"] }); + + // Register multiple DataFrames at once + const ctx = pl.SQLContext().registerMany({ tbl1: lf1, tbl2: lf2 }); + const tables = ctx.tables(); + + expect(tables).toEqual(expect.arrayContaining(["tbl1", "tbl2"])); + }); + test("inspect", () => { + const df = pl.DataFrame({ + a: [1, 2, 3], + b: ["m", "n", "o"], + }); + + const ctx = pl.SQLContext({ df }); + const actual = ctx[Symbol.for("nodejs.util.inspect.custom")](); + + const expected = "SQLContext: {df}"; + + expect(actual).toEqual(expected); + }); + test("constructor with LazyFrames", () => { + const lf1 = pl.DataFrame({ a: [1, 2, 3], b: ["m", "n", "o"] }).lazy(); + const lf2 = pl.DataFrame({ a: [2, 3, 4], c: ["p", "q", "r"] }).lazy(); + + const ctx = pl.SQLContext({ tbl1: lf1, tbl2: lf2 }); + const tables = ctx.tables(); + expect(tables).toEqual(expect.arrayContaining(["tbl1", "tbl2"])); + }); + test("unregister", () => { + const df = pl.DataFrame({ hello: ["world"] }); + const df2 = pl.DataFrame({ hello: ["world"] }); + const df3 = pl.DataFrame({ hello: ["world"] }); + const ctx = pl.SQLContext({ df, df2, df3 }); + + ctx.unregister("df"); + + const tables = ctx.tables(); + expect(tables).toEqual(["df2", "df3"]); + + ctx.unregister(["df2", "df3"]); + const tables2 = ctx.tables(); + expect(tables2).toEqual([]); + }); +}); diff --git a/polars/index.ts b/polars/index.ts index d30a9bb16..19dd8f1e5 100644 --- a/polars/index.ts +++ b/polars/index.ts @@ -17,6 +17,9 @@ export * from "./lazy/dataframe"; export * from "./lazy"; import * as lazy from "./lazy"; export * from "./types"; +import * as sql from "./sql"; +export type { SQLContext } from "./sql"; + export type { GroupBy } from "./groupby"; export namespace pl { export import Expr = lazy.Expr; @@ -109,6 +112,19 @@ export namespace pl { export import list = lazy.list; export import when = lazy.when; export const version = pli.version(); + + /** + * Run SQL queries against DataFrame/LazyFrame data. + * + * @warning This functionality is considered **unstable**, although it is close to being + * considered stable. It may be changed at any point without it being considered + * a breaking change. + */ + export function SQLContext( + frames?: Record, + ): sql.SQLContext { + return new sql.SQLContext(frames); + } } // eslint-disable-next-line no-undef export default pl; diff --git a/polars/sql.ts b/polars/sql.ts new file mode 100644 index 000000000..fb9f323ea --- /dev/null +++ b/polars/sql.ts @@ -0,0 +1,302 @@ +import { DataFrame, type LazyDataFrame, _LazyDataFrame } from "."; +import pli from "./internals/polars_internal"; +const INSPECT = Symbol.for("nodejs.util.inspect.custom"); + +/** + * Run SQL queries against DataFrame/LazyFrame data. + * + * @warning This functionality is considered **unstable**, although it is close to being + * considered stable. It may be changed at any point without it being considered + * a breaking change. + */ +export interface SQLContext { + /** + * Parse the given SQL query and execute it against the registered frame data. + * + * @param query - A valid string SQL query. + * @param eager - Apply the query eagerly, returning `DataFrame` instead of `LazyFrame`. + * If unset, the value of the init-time parameter "eager_execution" will be + * used. (Note that the query itself is always executed in lazy-mode; this + * parameter only impacts the type of the returned frame). + * + * @example + * Declare frame data and register with a SQLContext: + * + * ```ts + * const df = pl.DataFrame({ + * data: [ + * ("The Godfather", 1972, 6_000_000, 134_821_952, 9.2), + * ("The Dark Knight", 2008, 185_000_000, 533_316_061, 9.0), + * ("Schindler's List", 1993, 22_000_000, 96_067_179, 8.9), + * ("Pulp Fiction", 1994, 8_000_000, 107_930_000, 8.9), + * ("The Shawshank Redemption", 1994, 25_000_000, 28_341_469, 9.3), + * ], + * schema: ["title", "release_year", "budget", "gross", "imdb_score"], + * }); + * const ctx = pl.SQLContext({ films: df }); + * ``` + * + * Execute a SQL query against the registered frame data: + * + * ```ts + * const result = ctx.execute(` + * SELECT title, release_year, imdb_score + * FROM films + * WHERE release_year > 1990 + * ORDER BY imdb_score DESC + * `, { eager: true }); + * console.log(result); + * // shape: (4, 3) + * // ┌──────────────────────────┬──────────────┬────────────┐ + * // │ title ┆ release_year ┆ imdb_score │ + * // │ --- ┆ --- ┆ --- │ + * // ╞══════════════════════════╪══════════════╪════════════╡ + * // │ The Shawshank Redemption ┆ 1994 ┆ 9.3 │ + * // │ The Dark Knight ┆ 2008 ┆ 9.0 │ + * // │ Schindler's List ┆ 1993 ┆ 8.9 │ + * // │ Pulp Fiction ┆ 1994 ┆ 8.9 │ + * // └──────────────────────────┴──────────────┴────────────┘ + * ``` + * + * Execute a GROUP BY query: + * + * ```ts + * ctx.execute(` + * SELECT + * MAX(release_year / 10) * 10 AS decade, + * SUM(gross) AS total_gross, + * COUNT(title) AS n_films, + * FROM films + * GROUP BY (release_year / 10) -- decade + * ORDER BY total_gross DESC + * `, { eager: true }); + * // shape: (3, 3) + * // ┌────────┬─────────────┬─────────┐ + * // │ decade ┆ total_gross ┆ n_films │ + * // │ --- ┆ --- ┆ --- │ + * // ╞════════╪═════════════╪═════════╡ + * // │ 2000 ┆ 533316061 ┆ 1 │ + * // │ 1990 ┆ 232338648 ┆ 3 │ + * // │ 1970 ┆ 134821952 ┆ 1 │ + * // └────────┴─────────────┴─────────┘ + * ``` + */ + execute(query: string): LazyDataFrame; + execute(query: string, { eager }: { eager: true }): DataFrame; + execute(query: string, { eager }: { eager: false }): LazyDataFrame; + + /** + * Register a single frame as a table, using the given name. + * + * Parameters + * ---------- + * name : string + * Name of the table. + * frame : DataFrame | LazyFrame | null + * Eager/lazy frame to associate with this table name. + * + * See Also + * -------- + * register_globals + * register_many + * unregister + * + * Examples + * -------- + * const df = pl.DataFrame({"hello": ["world"]}); + * const ctx = pl.SQLContext(); + * ctx.register("frame_data", df).execute("SELECT * FROM frame_data").collect(); + * returns: shape: (1, 1) + * ┌───────┐ + * │ hello │ + * │ --- │ + * │ str │ + * ╞═══════╡ + * │ world │ + * └───────┘ + */ + register(name: string, frame: DataFrame | LazyDataFrame | null): SQLContext; + + /** + * Register multiple DataFrames as tables, using the associated names. + * + * @param {Object} frames An `{name: df, ...}` mapping. + * + * @returns {SQLContext} The SQLContext with registered DataFrames. + * + * @see register + * @see unregister + * + * @example + * const lf1 = pl.DataFrame({"a": [1, 2, 3], "b": ["m", "n", "o"]}); + * const lf2 = pl.DataFrame({"a": [2, 3, 4], "c": ["p", "q", "r"]}); + * + * // Register multiple DataFrames at once + * const ctx = pl.SQLContext().registerMany({"tbl1": lf1, "tbl2": lf2}); + * console.log(ctx.tables()); + * // Output: ['tbl1', 'tbl2'] + */ + registerMany(frames: Record): SQLContext; + /** + * Unregister one or more eager/lazy frames by name. + * + * @param names - Names of the tables to unregister. + * + * @remarks + * You can also control table registration lifetime by using `SQLContext` as a + * context manager; this can often be more useful when such control is wanted. + * + * Frames registered in-scope are automatically unregistered on scope-exit. Note + * that frames registered on construction will persist through subsequent scopes. + * + * @example + * ```ts + * const df0 = pl.DataFrame({"colx": [0, 1, 2]}); + * const df1 = pl.DataFrame({"colx": [1, 2, 3]}); + * const df2 = pl.DataFrame({"colx": [2, 3, 4]}); + * + * // Register one frame at construction time, and the other two in-scope + * const ctx = pl.SQLContext({ tbl0: df0 }); + * ctx.register("tbl1", df1); + * ctx.register("tbl2", df2); + * console.log(ctx.tables()); // Output: ['tbl0', 'tbl1', 'tbl2'] + * + * // After scope exit, none of the tables registered in-scope remain + * ``` + * + * @see register + * @see register_globals + * @see register_many + * + * @example + * ```ts + * const df0 = pl.DataFrame({"ints": [9, 8, 7, 6, 5]}); + * const lf1 = pl.LazyDataFrame({"text": ["a", "b", "c"]}); + * const lf2 = pl.LazyDataFrame({"misc": ["testing1234"]}); + * + * // Register with a SQLContext object + * const ctx = pl.SQLContext({ test1: df0, test2: lf1, test3: lf2 }); + * console.log(ctx.tables()); // Output: ['test1', 'test2', 'test3'] + * + * // Unregister one or more of the tables + * ctx.unregister(["test1", "test3"]); + * console.log(ctx.tables()); // Output: ['test2'] + * ctx.unregister("test2"); + * console.log(ctx.tables()); // Output: [] + * ``` + */ + unregister(names: string | string[]): SQLContext; + + /** + * Returns a list of the registered table names. + * + * @remarks + * The `tables` method will return the same values as the "SHOW TABLES" SQL statement, but as a list instead of a frame. + * + * Executing as SQL: + * ```ts + * const frame_data = pl.DataFrame({"hello": ["world"]}); + * const ctx = pl.SQLContext({ hello_world: frame_data }); + * console.log(ctx.execute("SHOW TABLES", { eager: true })); + * // shape: (1, 1) + * // ┌─────────────┐ + * // │ name │ + * // │ --- │ + * // │ str │ + * // ╞═════════════╡ + * // │ hello_world │ + * // └─────────────┘ + * ``` + * + * Calling the method: + * ```ts + * console.log(ctx.tables()); + * // ['hello_world'] + * ``` + * + * @example + * ```ts + * const df1 = pl.DataFrame({"hello": ["world"]}); + * const df2 = pl.DataFrame({"foo": ["bar", "baz"]}); + * const ctx = pl.SQLContext({ hello_data: df1, foo_bar: df2 }); + * console.log(ctx.tables()); + * // ['foo_bar', 'hello_data'] + * ``` + * + * @returns {string[]} An array of the registered table names. + */ + tables(): string[]; +} + +export class SQLContext implements SQLContext { + #ctx: any; // native SQLContext + [INSPECT](): string { + return `SQLContext: {${this.#ctx.getTables().join(", ")}}`; + } + + constructor(frames?: Record) { + this.#ctx = new pli.SqlContext(); + for (const [name, frame] of Object.entries(frames ?? {})) { + if (DataFrame.isDataFrame(frame)) { + this.#ctx.register(name, frame._df.lazy()); + } else { + this.#ctx.register(name, frame._ldf); + } + } + } + + execute(query: string): LazyDataFrame; + execute(query: string, { eager }: { eager: true }): DataFrame; + execute(query: string, { eager }: { eager: false }): LazyDataFrame; + execute( + this: SQLContext, + query: string, + { eager } = { eager: false }, + ): LazyDataFrame | DataFrame { + const lf_ = this.#ctx.execute(query); + const lf = _LazyDataFrame(lf_); + if (eager) { + return lf.collectSync(); + } + + return lf; + } + + register( + this: SQLContext, + name: string, + frame: DataFrame | LazyDataFrame | null, + ): SQLContext { + if (frame == null) { + frame = DataFrame().lazy(); + } else if (DataFrame.isDataFrame(frame)) { + frame = frame.lazy(); + } + this.#ctx.register(name, frame._ldf); + return this; + } + + registerMany( + this: SQLContext, + frames: Record, + ): SQLContext { + for (const [name, frame] of Object.entries(frames)) { + this.register(name, frame); + } + return this; + } + + unregister(names: string | string[]): SQLContext { + if (typeof names === "string") { + names = [names]; + } + for (const name of names) { + this.#ctx.unregister(name); + } + return this; + } + + tables(): string[] { + return this.#ctx.getTables(); + } +} diff --git a/src/lazy/dataframe.rs b/src/lazy/dataframe.rs index 0915c562d..7957ad41b 100644 --- a/src/lazy/dataframe.rs +++ b/src/lazy/dataframe.rs @@ -19,7 +19,7 @@ pub struct JsLazyGroupBy { #[repr(transparent)] #[derive(Clone)] pub struct JsLazyFrame { - ldf: LazyFrame, + pub(crate) ldf: LazyFrame, } impl From for JsLazyFrame { fn from(ldf: LazyFrame) -> Self { diff --git a/src/lib.rs b/src/lib.rs index 011a8f829..9671a06e7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,6 +27,7 @@ pub mod prelude; pub mod series; pub mod set; pub mod utils; +pub mod sql; pub use polars_core; diff --git a/src/sql.rs b/src/sql.rs new file mode 100644 index 000000000..b7384849b --- /dev/null +++ b/src/sql.rs @@ -0,0 +1,44 @@ +use crate::{export::JsLazyFrame, prelude::*}; +use polars::sql::SQLContext; + +#[napi(js_name = "SqlContext")] +#[repr(transparent)] +#[derive(Clone)] +pub struct JsSQLContext { + context: SQLContext, +} + +#[napi] +impl JsSQLContext { + #[napi(constructor)] + #[allow(clippy::new_without_default)] + pub fn new() -> JsSQLContext { + JsSQLContext { + context: SQLContext::new(), + } + } + + #[napi(catch_unwind)] + pub fn execute(&mut self, query: String) -> JsResult { + Ok(self + .context + .execute(&query) + .map_err(JsPolarsErr::from)? + .into()) + } + + #[napi] + pub fn get_tables(&self) -> Vec { + self.context.get_tables() + } + + #[napi] + pub fn register(&mut self, name: String, lf: &JsLazyFrame) { + self.context.register(&name, lf.clone().ldf) + } + + #[napi] + pub fn unregister(&mut self, name: String) { + self.context.unregister(&name) + } +}