Skip to content

Commit

Permalink
Allowing expr in contains (#312)
Browse files Browse the repository at this point in the history
Allowing expr in contains to match Python syntax to close #311

Co-authored-by: dchrostowski <dchrostowski@medallia.com>
  • Loading branch information
Bidek56 and dchrostowski authored Feb 24, 2025
1 parent bcff890 commit 9812d84
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 27 deletions.
28 changes: 26 additions & 2 deletions __tests__/expr.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1034,7 +1034,19 @@ describe("expr.str", () => {
expect(actual).toFrameEqual(expected);
expect(seriesActual).toSeriesEqual(expected.getColumn("isLinux"));
});

test("contains:expr", () => {
  // The pattern is taken from the "name" column, so every row of "os" is
  // checked against the value of "name" on that same row.
  const frame = pl.DataFrame({
    os: ["linux-kali", "linux-debian", "windows-vista"],
    name: ["kali", "debian", "macos"],
  });
  const isLinux = col("os").str.contains(pl.col("name")).as("isLinux");
  const actual = frame.withColumn(isLinux);
  const expected = frame.withColumn(
    pl.Series("isLinux", [true, true, false], pl.Bool),
  );
  expect(actual).toFrameEqual(expected);
});
test("contains:regex", () => {
const df = pl.DataFrame({
a: ["Foo", "foo", "FoO"],
Expand All @@ -1050,7 +1062,19 @@ describe("expr.str", () => {
expect(actual).toFrameEqual(expected);
expect(seriesActual).toSeriesEqual(expected.getColumn("contains"));
});

test("contains:regex2", () => {
  const frame = pl.DataFrame({ txt: ["Crab", "cat and dog", "rab$bit", null] });
  // "cat|bit" is used as a regular expression (default mode).
  const asRegex = pl.col("txt").str.contains("cat|bit").alias("regex");
  // Second argument `true` switches to literal matching, so "$" is a plain
  // character here and only "rab$bit" matches.
  const asLiteral = pl.col("txt").str.contains("rab$", true).alias("literal");
  const actual = frame.select(pl.col("txt"), asRegex, asLiteral);
  // Null input rows are expected to stay null in both result columns.
  const expected = frame.withColumns(
    pl.Series("regex", [false, true, true, null], pl.Bool),
    pl.Series("literal", [false, false, true, null], pl.Bool),
  );
  expect(actual).toFrameEqual(expected);
});
test("split", () => {
const df = pl.DataFrame({ a: ["ab,cd", "e,fg", "h"] });
const expected = pl.DataFrame({
Expand Down
38 changes: 34 additions & 4 deletions polars/lazy/expr/string.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,38 @@ export interface StringNamespace extends StringFunctions<Expr> {
* ```
*/
concat(delimiter: string, ignoreNulls?: boolean): Expr;
/** Check if strings in Series contain regex pattern. */
contains(pat: string | RegExp): Expr;
/**
* Check if strings contain a substring that matches a pattern.
* @param pat A valid regular expression pattern (compatible with the `regex` crate), given as a string, a `RegExp`, or an expression.
* @param literal Treat `pat` as a literal string, not as a regular expression.
* @param strict Raise an error if the underlying pattern is not a valid regex, otherwise mask out with a null value.
* @returns Boolean mask
* @example
* ```
* const df = pl.DataFrame({"txt": ["Crab", "cat and dog", "rab$bit", null]})
* df.select(
*   pl.col("txt"),
*   pl.col("txt").str.contains("cat|bit").alias("regex"),
*   pl.col("txt").str.contains("rab$", true).alias("literal"),
* )
* shape: (4, 3)
* ┌─────────────┬───────┬─────────┐
* │ txt         ┆ regex ┆ literal │
* │ ---         ┆ ---   ┆ ---     │
* │ str         ┆ bool  ┆ bool    │
* ╞═════════════╪═══════╪═════════╡
* │ Crab        ┆ false ┆ false   │
* │ cat and dog ┆ true  ┆ false   │
* │ rab$bit     ┆ true  ┆ true    │
* │ null        ┆ null  ┆ null    │
* └─────────────┴───────┴─────────┘
* ```
*/
contains(
pat: string | RegExp | Expr,
literal?: boolean,
strict?: boolean,
): Expr;
/**
* Decodes a value using the provided encoding
* @param encoding - hex | base64
Expand Down Expand Up @@ -321,8 +351,8 @@ export const ExprStringFunctions = (_expr: any): StringNamespace => {
// Delegates to `wrap("strConcat", ...)`, passing the delimiter through unchanged.
// `ignoreNulls` is `true` when the caller omits it.
concat(delimiter: string, ignoreNulls = true) {
return wrap("strConcat", delimiter, ignoreNulls);
},
contains(pat: string | RegExp) {
return wrap("strContains", regexToString(pat), false);
/**
 * Build a `strContains` expression for the given pattern.
 *
 * @param pat Pattern as a string, a `RegExp`, or an expression.
 * @param literal When true, `pat` is matched literally instead of as a regex. Defaults to `false`.
 * @param strict Forwarded to the `strContains` call. Defaults to `true`.
 */
contains(pat: string | RegExp | Expr, literal = false, strict = true) {
  // The interface advertises `RegExp` support, so a RegExp object must be
  // turned into its pattern string first (as the previous implementation
  // did); strings and expressions are passed on untouched.
  const pattern = pat instanceof RegExp ? regexToString(pat) : pat;
  return wrap("strContains", exprToLitOrExpr(pattern)._expr, literal, strict);
},
decode(arg, strict = false) {
if (typeof arg === "string") {
Expand Down
38 changes: 32 additions & 6 deletions polars/series/string.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import type { DataType } from "../datatypes";
import { col } from "../lazy/functions";
import type { StringFunctions } from "../shared_traits";
import { regexToString } from "../utils";
import type { Expr } from "./../lazy/expr/index";
import { type Expr, exprToLitOrExpr } from "./../lazy/expr/index";

/**
* namespace containing series string functions
Expand All @@ -19,11 +19,37 @@ export interface StringNamespace extends StringFunctions<Series> {
*/
concat(delimiter: string, ignoreNulls?: boolean): Series;
/**
* Check if strings in Series contain regex pattern.
* @param pattern A valid regex pattern
* Check if strings in Series contain a substring that matches a pattern.
* @param pat A valid regular expression pattern, compatible with the `regex` crate
* @param literal Treat `pat` as a literal string, not as a regular expression.
* @param strict Raise an error if the underlying pattern is not a valid regex, otherwise mask out with a null value.
* @returns Boolean mask
* @example
* ```
* const df = pl.DataFrame({"txt": ["Crab", "cat and dog", "rab$bit", null]})
* df.select(
* ... pl.col("txt"),
* ... pl.col("txt").str.contains("cat|bit").alias("regex"),
* ... pl.col("txt").str.contains("rab$", true).alias("literal"),
* ... )
* shape: (4, 3)
* ┌─────────────┬───────┬─────────┐
* │ txt ┆ regex ┆ literal │
* │ --- ┆ --- ┆ --- │
* │ str ┆ bool ┆ bool │
* ╞═════════════╪═══════╪═════════╡
* │ Crab ┆ false ┆ false │
* │ cat and dog ┆ true ┆ false │
* │ rab$bit ┆ true ┆ true │
* │ null ┆ null ┆ null │
* └─────────────┴───────┴─────────┘
* ```
*/
contains(pattern: string | RegExp): Series;
contains(
pat: string | RegExp | Expr,
literal?: boolean,
strict?: boolean,
): Series;
/**
* Decodes a value using the provided encoding
* @param encoding - hex | base64
Expand Down Expand Up @@ -279,8 +305,8 @@ export const SeriesStringFunctions = (_s: any): StringNamespace => {
.select(col(_s.name).str.concat(delimiter, ignoreNulls).as(_s.name))
.getColumn(_s.name);
},
contains(pat: string | RegExp) {
return wrap("strContains", regexToString(pat), false);
// Delegates to `wrap("strContains", ...)` after running the pattern through
// `regexToString`. `literal` is `false` and `strict` is `true` when omitted.
// NOTE(review): the `pat as RegExp` cast only satisfies the type checker — an
// `Expr` argument is handed to `regexToString` as-is. Confirm that helper and
// the underlying `strContains` call really accept an expression pattern on a
// Series, and that they take the extra `literal` argument.
contains(pat: string | RegExp | Expr, literal = false, strict = true) {
return wrap("strContains", regexToString(pat as RegExp), literal, strict);
},
decode(arg, strict = false) {
if (typeof arg === "string") {
Expand Down
30 changes: 28 additions & 2 deletions polars/shared_traits.ts
Original file line number Diff line number Diff line change
Expand Up @@ -850,8 +850,34 @@ export interface StringFunctions<T> {
* ```
*/
concat(delimiter: string, ignoreNulls?: boolean): T;
/** Check if strings in Series contain regex pattern. */
contains(pat: string | RegExp): T;
/**
 * Check if strings contain a substring that matches a pattern.
 * @param pat A valid regular expression pattern (compatible with the `regex` crate), given as a string, a `RegExp`, or an expression.
 * @param literal Treat `pat` as a literal string, not as a regular expression. Optional.
 * @param strict Raise an error if the underlying pattern is not a valid regex, otherwise mask out with a null value. Optional.
 * @returns Boolean mask
 * @example
 * ```
 * const df = pl.DataFrame({"txt": ["Crab", "cat and dog", "rab$bit", null]})
 * df.select(
 *   pl.col("txt"),
 *   pl.col("txt").str.contains("cat|bit").alias("regex"),
 *   pl.col("txt").str.contains("rab$", true).alias("literal"),
 * )
 * shape: (4, 3)
 * ┌─────────────┬───────┬─────────┐
 * │ txt         ┆ regex ┆ literal │
 * │ ---         ┆ ---   ┆ ---     │
 * │ str         ┆ bool  ┆ bool    │
 * ╞═════════════╪═══════╪═════════╡
 * │ Crab        ┆ false ┆ false   │
 * │ cat and dog ┆ true  ┆ false   │
 * │ rab$bit     ┆ true  ┆ true    │
 * │ null        ┆ null  ┆ null    │
 * └─────────────┴───────┴─────────┘
 * ```
 */
contains(
  pat: string | RegExp | Expr,
  literal?: boolean,
  strict?: boolean,
): T;
/**
* Decodes a value using the provided encoding
* @param encoding - hex | base64
Expand Down
28 changes: 15 additions & 13 deletions src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -834,19 +834,21 @@ impl JsExpr {
}

#[napi(catch_unwind)]
pub fn str_contains(&self, pat: String, strict: bool) -> JsExpr {
let function = move |s: Column| {
let ca = s.str()?;
match ca.contains(&pat, strict) {
Ok(ca) => Ok(Some(ca.into_column())),
Err(e) => Err(PolarsError::ComputeError(format!("{:?}", e).into())),
}
};
self.clone()
.inner
.map(function, GetOutput::from_type(DataType::Boolean))
.with_fmt("str.contains")
.into()
/// Builds an expression that tests whether each string value contains `pat`.
///
/// When `literal` is `true` the pattern is matched via `contains_literal`
/// and `strict` is not consulted; otherwise the pattern is passed to
/// `contains` together with `strict`.
pub fn str_contains(&self, pat: &JsExpr, literal: bool, strict: bool) -> JsExpr {
    let haystack = self.inner.clone().str();
    let needle = pat.inner.clone();
    if literal {
        haystack.contains_literal(needle).into()
    } else {
        haystack.contains(needle, strict).into()
    }
}
#[napi(catch_unwind)]
pub fn str_hex_encode(&self) -> JsExpr {
Expand Down

0 comments on commit 9812d84

Please sign in to comment.