From e400c6a5906f57325c1e5981dc092120c19e9332 Mon Sep 17 00:00:00 2001 From: lemorage Date: Fri, 16 May 2025 17:38:06 +0200 Subject: [PATCH 1/4] feat: add support for TimeDelta type in Python and Rust Continuation on #368 --- docs/docs/core/data_types.mdx | 1 + python/cocoindex/typing.py | 3 + src/base/duration.rs | 455 ++++++++++++++++++++++++++++++++++ src/base/json_schema.rs | 11 + src/base/mod.rs | 1 + src/base/schema.rs | 4 + src/base/value.rs | 17 +- src/execution/query.rs | 3 +- src/ops/storages/neo4j.rs | 18 +- src/ops/storages/postgres.rs | 17 +- src/ops/storages/qdrant.rs | 11 +- src/py/convert.rs | 8 +- 12 files changed, 538 insertions(+), 11 deletions(-) create mode 100644 src/base/duration.rs diff --git a/docs/docs/core/data_types.mdx b/docs/docs/core/data_types.mdx index 21d36c5..04c91f5 100644 --- a/docs/docs/core/data_types.mdx +++ b/docs/docs/core/data_types.mdx @@ -35,6 +35,7 @@ This is the list of all basic types supported by CocoIndex: | Time | | `datetime.time` | `datetime.time` | | LocalDatetime | Date and time without timezone | `cocoindex.LocalDateTime` | `datetime.datetime` | | OffsetDatetime | Date and time with a timezone offset | `cocoindex.OffsetDateTime` | `datetime.datetime` | +| TimeDelta | A duration of time | `cocoindex.typing.TimeDelta` | `datetime.timedelta` | | Vector[*T*, *Dim*?] | *T* must be basic type. *Dim* is a positive integer and optional. |`cocoindex.Vector[T]` or `cocoindex.Vector[T, Dim]` | `list[T]` | | Json | | `cocoindex.Json` | Any data convertible to JSON by `json` package | diff --git a/python/cocoindex/typing.py b/python/cocoindex/typing.py index 496ca5d..90637f7 100644 --- a/python/cocoindex/typing.py +++ b/python/cocoindex/typing.py @@ -29,6 +29,7 @@ def __init__(self, key: str, value: Any): Json = Annotated[Any, TypeKind('Json')] LocalDateTime = Annotated[datetime.datetime, TypeKind('LocalDateTime')] OffsetDateTime = Annotated[datetime.datetime, TypeKind('OffsetDateTime')] +TimeDelta = Annotated[datetime.timedelta, TypeKind('TimeDelta')] if TYPE_CHECKING: T_co = TypeVar('T_co', covariant=True) @@ -168,6 +169,8 @@ def analyze_type_info(t) -> AnalyzedTypeInfo: kind = 'Time' elif t is datetime.datetime: kind = 'OffsetDateTime' + elif t is datetime.timedelta: + kind = 'TimeDelta' else: raise ValueError(f"type unsupported yet: {t}") diff --git a/src/base/duration.rs b/src/base/duration.rs new file mode 100644 index 0000000..96efcb2 --- /dev/null +++ b/src/base/duration.rs @@ -0,0 +1,455 @@ +use anyhow::{Result, anyhow}; +use chrono::Duration; + +/// Parses a duration string into a chrono::Duration. +pub fn parse_duration(s: &str) -> Result { + let s = s.trim(); + if s.is_empty() { + return Err(anyhow!("Empty duration string")); + } + + // Try ISO 8601-like format + if s.starts_with('P') { + let s = s + .strip_prefix('P') + .ok_or_else(|| anyhow!("Invalid duration: {}", s))?; + let mut total = Duration::zero(); + let mut num = String::new(); + let mut negative = false; + + for c in s.chars() { + match c { + '-' if num.is_empty() => { + negative = true; + } + c if c.is_ascii_digit() => { + num.push(c); + } + unit => { + if num.is_empty() { + return Err(anyhow!("Missing number in duration: {}", s)); + } + let n: i64 = num + .parse() + .map_err(|_| anyhow!("Invalid number: {}", num))?; + let n = if negative { -n } else { n }; + match unit.to_ascii_uppercase() { + 'D' => total += Duration::days(n), + 'H' => total += Duration::hours(n), + 'M' => total += Duration::minutes(n), + 'S' => total += Duration::seconds(n), + c => return Err(anyhow!("Invalid unit in duration: {}", c)), + } + num.clear(); + negative = false; + } + } + } + + if total != Duration::zero() { + return Ok(total); + } + return Err(anyhow!("Invalid duration: {}", s)); + } + + // Try human-readable format + let parts: Vec<&str> = s.split_whitespace().collect(); + let mut total = Duration::zero(); + let mut i = 0; + + while i < parts.len() { + let num_str = parts[i]; + let num: i64 = num_str + .parse() + .map_err(|_| anyhow!("Invalid number: {}", num_str))?; + i += 1; + let unit = parts + .get(i) + .ok_or_else(|| anyhow!("Missing unit in duration: {}", s))?; + match unit.to_lowercase().as_str() { + "day" | "days" => total += Duration::days(num), + "hour" | "hours" => total += Duration::hours(num), + "minute" | "minutes" => total += Duration::minutes(num), + "second" | "seconds" => total += Duration::seconds(num), + "millisecond" | "milliseconds" => total += Duration::milliseconds(num), + "microsecond" | "microseconds" => total += Duration::microseconds(num), + unit => return Err(anyhow!("Unknown unit: {}", unit)), + } + i += 1; + } + + if total == Duration::zero() { + return Err(anyhow!("Invalid duration: {}", s)); + } + Ok(total) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn check_ok(res: Result, expected: Duration, input_str: &str) { + match res { + Ok(duration) => assert_eq!(duration, expected, "Input: '{}'", input_str), + Err(e) => panic!( + "Input: '{}', expected Ok({:?}), but got Err: {}", + input_str, expected, e + ), + } + } + + fn check_err_contains(res: Result, expected_substring: &str, input_str: &str) { + match res { + Ok(d) => panic!( + "Input: '{}', expected error containing '{}', but got Ok({:?})", + input_str, expected_substring, d + ), + Err(e) => { + let err_msg = e.to_string(); + assert!( + err_msg.contains(expected_substring), + "Input: '{}', error message '{}' does not contain expected substring '{}'", + input_str, + err_msg, + expected_substring + ); + } + } + } + + #[test] + fn test_empty_string() { + check_err_contains(parse_duration(""), "Empty duration string", "\"\""); + } + + #[test] + fn test_whitespace_string() { + check_err_contains(parse_duration(" "), "Empty duration string", "\" \""); + } + + #[test] + fn test_iso_just_p() { + check_err_contains(parse_duration("P"), "Invalid duration: ", "\"P\""); + } + + #[test] + fn test_iso_missing_number_before_unit() { + check_err_contains( + parse_duration("PH"), + "Missing number in duration: H", + "\"PH\"", + ); + } + + #[test] + fn test_iso_char_t_issue() { + check_err_contains( + parse_duration("PT1H"), + "Missing number in duration: T1H", + "\"PT1H\"", + ); + } + + #[test] + fn test_iso_invalid_number_parse() { + check_err_contains( + parse_duration("P99999999999999999999H"), + "Invalid number: 99999999999999999999", + "\"P99999999999999999999H\"", + ); + } + + #[test] + fn test_iso_invalid_unit() { + check_err_contains( + parse_duration("P1X"), + "Invalid unit in duration: X", + "\"P1X\"", + ); + } + + #[test] + fn test_iso_valid_days() { + check_ok(parse_duration("P1D"), Duration::days(1), "\"P1D\""); + } + + #[test] + fn test_iso_valid_hours() { + check_ok(parse_duration("P2H"), Duration::hours(2), "\"P2H\""); + } + + #[test] + fn test_iso_valid_minutes() { + check_ok(parse_duration("P3M"), Duration::minutes(3), "\"P3M\""); + } + + #[test] + fn test_iso_valid_seconds() { + check_ok(parse_duration("P4S"), Duration::seconds(4), "\"P4S\""); + } + + #[test] + fn test_iso_valid_lowercase_p_and_unit() { + check_err_contains(parse_duration("p1h"), "Invalid number: p1h", "\"p1h\""); + } + + #[test] + fn test_iso_valid_uppercase_p_lowercase_unit() { + check_ok(parse_duration("P1h"), Duration::hours(1), "\"P1h\""); + } + + #[test] + fn test_iso_combined_units() { + check_ok( + parse_duration("P1D2H3M4S"), + Duration::days(1) + Duration::hours(2) + Duration::minutes(3) + Duration::seconds(4), + "\"P1D2H3M4S\"", + ); + } + + #[test] + fn test_iso_trailing_number_bug_demonstration_1() { + check_ok( + parse_duration("P1D2"), + Duration::days(1), + "\"P1D2\" (bug: trailing '2' ignored)", + ); + } + + #[test] + fn test_iso_trailing_number_bug_demonstration_2() { + check_err_contains(parse_duration("P1"), "Invalid duration: 1", "\"P1\""); + } + + #[test] + fn test_iso_zero_duration_pd0() { + check_err_contains(parse_duration("P0D"), "Invalid duration: 0D", "\"P0D\""); + } + + #[test] + fn test_iso_zero_duration_pt0s_fail() { + check_err_contains( + parse_duration("PT0S"), + "Missing number in duration: T0S", + "\"PT0S\"", + ); + } + + #[test] + fn test_iso_fractional_seconds_fail() { + check_err_contains( + parse_duration("PT1.5S"), + "Missing number in duration: T1.5S", + "\"PT1.5S\"", + ); + } + + #[test] + fn test_human_invalid_number_at_start() { + check_err_contains( + parse_duration("one day"), + "Invalid number: one", + "\"one day\"", + ); + } + + #[test] + fn test_human_float_number_fail() { + check_err_contains( + parse_duration("1.5 hours"), + "Invalid number: 1.5", + "\"1.5 hours\"", + ); + } + + #[test] + fn test_human_missing_unit_after_number_at_end() { + check_err_contains(parse_duration("1"), "Missing unit in duration: 1", "\"1\""); + } + + #[test] + fn test_human_missing_unit_after_number_in_middle() { + check_err_contains( + parse_duration("1 day 2"), + "Missing unit in duration: 1 day 2", + "\"1 day 2\"", + ); + } + + #[test] + fn test_human_unknown_unit() { + check_err_contains(parse_duration("1 year"), "Unknown unit: year", "\"1 year\""); + } + + #[test] + fn test_human_valid_day() { + check_ok(parse_duration("1 day"), Duration::days(1), "\"1 day\""); + } + + #[test] + fn test_human_valid_days_uppercase() { + check_ok(parse_duration("2 DAYS"), Duration::days(2), "\"2 DAYS\""); + } + + #[test] + fn test_human_valid_hour() { + check_ok(parse_duration("3 hour"), Duration::hours(3), "\"3 hour\""); + } + + #[test] + fn test_human_valid_hours_mixedcase() { + check_ok(parse_duration("4 HoUrS"), Duration::hours(4), "\"4 HoUrS\""); + } + + #[test] + fn test_human_valid_minute() { + check_ok( + parse_duration("5 minute"), + Duration::minutes(5), + "\"5 minute\"", + ); + } + + #[test] + fn test_human_valid_minutes() { + check_ok( + parse_duration("6 minutes"), + Duration::minutes(6), + "\"6 minutes\"", + ); + } + + #[test] + fn test_human_valid_second() { + check_ok( + parse_duration("7 second"), + Duration::seconds(7), + "\"7 second\"", + ); + } + + #[test] + fn test_human_valid_seconds() { + check_ok( + parse_duration("8 seconds"), + Duration::seconds(8), + "\"8 seconds\"", + ); + } + + #[test] + fn test_human_valid_millisecond() { + check_ok( + parse_duration("9 millisecond"), + Duration::milliseconds(9), + "\"9 millisecond\"", + ); + } + + #[test] + fn test_human_valid_milliseconds() { + check_ok( + parse_duration("10 milliseconds"), + Duration::milliseconds(10), + "\"10 milliseconds\"", + ); + } + + #[test] + fn test_human_valid_microsecond() { + check_ok( + parse_duration("11 microsecond"), + Duration::microseconds(11), + "\"11 microsecond\"", + ); + } + + #[test] + fn test_human_valid_microseconds() { + check_ok( + parse_duration("12 microseconds"), + Duration::microseconds(12), + "\"12 microseconds\"", + ); + } + + #[test] + fn test_human_combined_units() { + let expected = Duration::days(1) + + Duration::hours(2) + + Duration::minutes(30) + + Duration::seconds(15) + + Duration::milliseconds(100) + + Duration::microseconds(200); + check_ok( + parse_duration("1 day 2 hours 30 minutes 15 seconds 100 milliseconds 200 microseconds"), + expected, + "\"1 day 2 hours 30 minutes 15 seconds 100 milliseconds 200 microseconds\"", + ); + } + + #[test] + fn test_human_zero_duration_seconds() { + check_err_contains( + parse_duration("0 seconds"), + "Invalid duration: 0 seconds", + "\"0 seconds\"", + ); + } + + #[test] + fn test_human_zero_duration_days_hours() { + check_err_contains( + parse_duration("0 day 0 hour"), + "Invalid duration: 0 day 0 hour", + "\"0 day 0 hour\"", + ); + } + + #[test] + fn test_human_no_space_between_num_unit() { + check_err_contains(parse_duration("1day"), "Invalid number: 1day", "\"1day\""); + } + + #[test] + fn test_human_extra_whitespace() { + check_ok( + parse_duration(" 1 day 2 hours "), + Duration::days(1) + Duration::hours(2), + "\" 1 day 2 hours \"", + ); + } + + #[test] + fn test_human_just_unit_no_number() { + check_err_contains(parse_duration("day"), "Invalid number: day", "\"day\""); + } + #[test] + fn test_human_negative_numbers() { + check_ok( + parse_duration("-1 day 2 hours"), + Duration::days(-1) + Duration::hours(2), + "\"-1 day 2 hours\"", + ); + check_ok( + parse_duration("1 day -2 hours"), + Duration::days(1) + Duration::hours(-2), + "\"1 day -2 hours\"", + ); + } + + #[test] + fn test_iso_negative_numbers() { + check_ok(parse_duration("P-1D"), Duration::days(-1), "\"P-1D\""); + check_ok( + parse_duration("P-1H-2M"), + Duration::hours(-1) + Duration::minutes(-2), + "\"P-1H-2M\"", + ); + check_err_contains( + parse_duration("P-1-2H"), + "Invalid unit in duration: -", + "\"P-1-2H\"", + ); + } +} diff --git a/src/base/json_schema.rs b/src/base/json_schema.rs index 2e78719..9ae9b2b 100644 --- a/src/base/json_schema.rs +++ b/src/base/json_schema.rs @@ -150,6 +150,17 @@ impl JsonSchemaBuilder { field_path, ); } + &schema::BasicValueType::TimeDelta => { + schema.instance_type = Some(SingleOrVec::Single(Box::new(InstanceType::String))); + if self.options.supports_format { + schema.format = Some("duration".to_string()); + } + self.set_description( + &mut schema, + "A duration, e.g. 'PT1H2M3S' (ISO 8601) or '1 day 2 hours 3 seconds'", + field_path, + ); + } schema::BasicValueType::Json => { // Can be any value. No type constraint. } diff --git a/src/base/mod.rs b/src/base/mod.rs index fb6a169..74bc90f 100644 --- a/src/base/mod.rs +++ b/src/base/mod.rs @@ -1,3 +1,4 @@ +pub mod duration; pub mod field_attrs; pub mod json_schema; pub mod schema; diff --git a/src/base/schema.rs b/src/base/schema.rs index d5d7bec..d32dd27 100644 --- a/src/base/schema.rs +++ b/src/base/schema.rs @@ -48,6 +48,9 @@ pub enum BasicValueType { /// Date and time with timezone. OffsetDateTime, + /// A time duration. + TimeDelta, + /// A JSON value. Json, @@ -70,6 +73,7 @@ impl std::fmt::Display for BasicValueType { BasicValueType::Time => write!(f, "Time"), BasicValueType::LocalDateTime => write!(f, "LocalDateTime"), BasicValueType::OffsetDateTime => write!(f, "OffsetDateTime"), + BasicValueType::TimeDelta => write!(f, "TimeDelta"), BasicValueType::Json => write!(f, "Json"), BasicValueType::Vector(s) => { write!(f, "Vector[{}", s.element_type)?; diff --git a/src/base/value.rs b/src/base/value.rs index c3266fb..7264b75 100644 --- a/src/base/value.rs +++ b/src/base/value.rs @@ -1,4 +1,5 @@ use super::schema::*; +use crate::base::duration::parse_duration; use crate::{api_bail, api_error}; use anyhow::Result; use base64::prelude::*; @@ -6,9 +7,9 @@ use bytes::Bytes; use chrono::Offset; use log::warn; use serde::{ + Deserialize, Serialize, de::{SeqAccess, Visitor}, ser::{SerializeMap, SerializeSeq, SerializeTuple}, - Deserialize, Serialize, }; use std::{collections::BTreeMap, ops::Deref, sync::Arc}; @@ -354,6 +355,7 @@ pub enum BasicValue { Time(chrono::NaiveTime), LocalDateTime(chrono::NaiveDateTime), OffsetDateTime(chrono::DateTime), + TimeDelta(chrono::Duration), Json(Arc), Vector(Arc<[BasicValue]>), } @@ -436,6 +438,12 @@ impl From> for BasicValue { } } +impl From for BasicValue { + fn from(value: chrono::Duration) -> Self { + BasicValue::TimeDelta(value) + } +} + impl From for BasicValue { fn from(value: serde_json::Value) -> Self { BasicValue::Json(Arc::from(value)) @@ -465,6 +473,7 @@ impl BasicValue { | BasicValue::Time(_) | BasicValue::LocalDateTime(_) | BasicValue::OffsetDateTime(_) + | BasicValue::TimeDelta(_) | BasicValue::Json(_) | BasicValue::Vector(_) => api_bail!("invalid key value type"), }; @@ -485,6 +494,7 @@ impl BasicValue { | BasicValue::Time(_) | BasicValue::LocalDateTime(_) | BasicValue::OffsetDateTime(_) + | BasicValue::TimeDelta(_) | BasicValue::Json(_) | BasicValue::Vector(_) => api_bail!("invalid key value type"), }; @@ -505,6 +515,7 @@ impl BasicValue { BasicValue::Time(_) => "time", BasicValue::LocalDateTime(_) => "local_datetime", BasicValue::OffsetDateTime(_) => "offset_datetime", + BasicValue::TimeDelta(_) => "timedelta", BasicValue::Json(_) => "json", BasicValue::Vector(_) => "vector", } @@ -855,6 +866,7 @@ impl serde::Serialize for BasicValue { BasicValue::OffsetDateTime(v) => { serializer.serialize_str(&v.to_rfc3339_opts(chrono::SecondsFormat::AutoSi, true)) } + BasicValue::TimeDelta(v) => serializer.serialize_str(&v.to_string()), BasicValue::Json(v) => v.serialize(serializer), BasicValue::Vector(v) => v.serialize(serializer), } @@ -907,6 +919,9 @@ impl BasicValue { } } } + (serde_json::Value::String(v), BasicValueType::TimeDelta) => { + BasicValue::TimeDelta(parse_duration(&v)?) + } (v, BasicValueType::Json) => BasicValue::Json(Arc::from(v)), ( serde_json::Value::Array(v), diff --git a/src/execution/query.rs b/src/execution/query.rs index 0f8ebe5..9052096 100644 --- a/src/execution/query.rs +++ b/src/execution/query.rs @@ -1,6 +1,6 @@ use std::{sync::Arc, vec}; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use serde::Serialize; use super::evaluator::evaluate_transient_flow; @@ -92,6 +92,7 @@ impl SimpleSemanticsQueryHandler { | value::BasicValue::Time(_) | value::BasicValue::LocalDateTime(_) | value::BasicValue::OffsetDateTime(_) + | value::BasicValue::TimeDelta(_) | value::BasicValue::Json(_) | value::BasicValue::Vector(_) => { bail!("Query results is not a vector of number") diff --git a/src/ops/storages/neo4j.rs b/src/ops/storages/neo4j.rs index b25334d..4b639fe 100644 --- a/src/ops/storages/neo4j.rs +++ b/src/ops/storages/neo4j.rs @@ -1,7 +1,7 @@ use crate::prelude::*; use super::spec::{GraphDeclaration, GraphElementMapping, NodeFromFieldsSpec, TargetFieldMapping}; -use crate::setup::components::{self, apply_component_changes, State}; +use crate::setup::components::{self, State, apply_component_changes}; use crate::setup::{ResourceSetupStatus, SetupChangeType}; use crate::{ops::sdk::*, setup::CombinedState}; @@ -266,6 +266,14 @@ fn basic_value_to_bolt(value: &BasicValue, schema: &BasicValueType) -> Result BoltType::DateTime(neo4rs::BoltDateTime::from(*v)), + BasicValue::TimeDelta(v) => BoltType::Duration(neo4rs::BoltDuration::new( + neo4rs::BoltInteger { value: 0 }, + neo4rs::BoltInteger { value: 0 }, + neo4rs::BoltInteger { + value: v.num_seconds(), + }, + v.subsec_nanos().into(), + )), BasicValue::Vector(v) => match schema { BasicValueType::Vector(t) => BoltType::List(neo4rs::BoltList { value: v @@ -772,7 +780,9 @@ impl components::SetupOperator for SetupComponentOperator { metric, vector_size, } => { - format!("{key_desc} ON {label} (field_name: {field_name}, vector_size: {vector_size}, metric: {metric})",) + format!( + "{key_desc} ON {label} (field_name: {field_name}, vector_size: {vector_size}, metric: {metric})", + ) } } } @@ -794,8 +804,8 @@ impl components::SetupOperator for SetupComponentOperator { }; format!( "CREATE CONSTRAINT {name} IF NOT EXISTS FOR {matcher} REQUIRE {field_names} IS {key_type} KEY", - name=key.name, - field_names=build_composite_field_names(qualifier, &field_names), + name = key.name, + field_names = build_composite_field_names(qualifier, &field_names), ) } IndexDef::VectorIndex { diff --git a/src/ops/storages/postgres.rs b/src/ops/storages/postgres.rs index 6e630a2..a6f163e 100644 --- a/src/ops/storages/postgres.rs +++ b/src/ops/storages/postgres.rs @@ -11,8 +11,8 @@ use futures::FutureExt; use indexmap::{IndexMap, IndexSet}; use itertools::Itertools; use serde::Serialize; -use sqlx::postgres::types::PgRange; use sqlx::postgres::PgRow; +use sqlx::postgres::types::PgRange; use sqlx::{PgPool, Row}; use std::ops::Bound; use uuid::Uuid; @@ -132,6 +132,9 @@ fn bind_value_field<'arg>( BasicValue::OffsetDateTime(v) => { builder.push_bind(v); } + BasicValue::TimeDelta(v) => { + builder.push_bind(v); + } BasicValue::Json(v) => { builder.push_bind(sqlx::types::Json(&**v)); } @@ -217,6 +220,17 @@ fn from_pg_value(row: &PgRow, field_idx: usize, typ: &ValueType) -> Result row .try_get::>, _>(field_idx)? .map(BasicValue::OffsetDateTime), + // BasicValueType::TimeDelta => row + // .try_get::, _>(field_idx)? + // .map(BasicValue::TimeDelta), + BasicValueType::TimeDelta => row + .try_get::, _>(field_idx)? + .map(|pg_interval| { + let duration = chrono::Duration::microseconds(pg_interval.microseconds) + + chrono::Duration::days(pg_interval.days as i64) + + chrono::Duration::days((pg_interval.months as i64) * 30); + BasicValue::TimeDelta(duration) + }), BasicValueType::Json => row .try_get::, _>(field_idx)? .map(|v| BasicValue::Json(Arc::from(v))), @@ -691,6 +705,7 @@ fn to_column_type_sql(column_type: &ValueType) -> Cow<'static, str> { BasicValueType::Time => "time".into(), BasicValueType::LocalDateTime => "timestamp".into(), BasicValueType::OffsetDateTime => "timestamp with time zone".into(), + BasicValueType::TimeDelta => "interval".into(), BasicValueType::Json => "jsonb".into(), BasicValueType::Vector(vec_schema) => { if convertible_to_pgvector(vec_schema) { diff --git a/src/ops/storages/qdrant.rs b/src/ops/storages/qdrant.rs index 3b1e789..f4724eb 100644 --- a/src/ops/storages/qdrant.rs +++ b/src/ops/storages/qdrant.rs @@ -3,17 +3,18 @@ use std::convert::Infallible; use std::fmt::Display; use std::sync::Arc; +use crate::base::duration::parse_duration; use crate::ops::sdk::*; use crate::setup; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use futures::FutureExt; +use qdrant_client::Qdrant; use qdrant_client::qdrant::vectors_output::VectorsOptions; use qdrant_client::qdrant::{ DeletePointsBuilder, NamedVectors, PointId, PointStruct, PointsIdsList, UpsertPointsBuilder, Value as QdrantValue, }; use qdrant_client::qdrant::{Query, QueryPointsBuilder, ScoredPoint}; -use qdrant_client::Qdrant; use serde_json::json; #[derive(Debug, Deserialize, Clone)] @@ -131,6 +132,7 @@ fn values_to_payload( BasicValue::LocalDateTime(v) => v.to_string().into(), BasicValue::Time(v) => v.to_string().into(), BasicValue::OffsetDateTime(v) => v.to_string().into(), + BasicValue::TimeDelta(v) => v.to_string().into(), BasicValue::Json(v) => (**v).clone(), BasicValue::Vector(v) => { let vector = convert_to_vector(v.to_vec()); @@ -191,6 +193,11 @@ fn into_value(point: &ScoredPoint, schema: &FieldSchema) -> Result { .get(field_name) .and_then(|v| v.as_double().map(BasicValue::Float64)), + BasicValueType::TimeDelta => point.payload.get(field_name).and_then(|v| { + v.as_str() + .and_then(|s| parse_duration(s).ok().map(BasicValue::TimeDelta)) + }), + BasicValueType::Json => point .payload .get(field_name) diff --git a/src/py/convert.rs b/src/py/convert.rs index 327ba82..a09d3d4 100644 --- a/src/py/convert.rs +++ b/src/py/convert.rs @@ -1,10 +1,10 @@ use bytes::Bytes; -use pyo3::types::{PyList, PyTuple}; use pyo3::IntoPyObjectExt; +use pyo3::types::{PyList, PyTuple}; use pyo3::{exceptions::PyException, prelude::*}; use pythonize::{depythonize, pythonize}; -use serde::de::DeserializeOwned; use serde::Serialize; +use serde::de::DeserializeOwned; use std::collections::BTreeMap; use std::ops::Deref; use std::sync::Arc; @@ -70,6 +70,7 @@ fn basic_value_to_py_object<'py>( value::BasicValue::Time(v) => v.into_bound_py_any(py)?, value::BasicValue::LocalDateTime(v) => v.into_bound_py_any(py)?, value::BasicValue::OffsetDateTime(v) => v.into_bound_py_any(py)?, + value::BasicValue::TimeDelta(v) => v.into_bound_py_any(py)?, value::BasicValue::Json(v) => pythonize(py, v).into_py_result()?, value::BasicValue::Vector(v) => v .iter() @@ -143,6 +144,9 @@ fn basic_value_from_py_object<'py>( schema::BasicValueType::OffsetDateTime => { value::BasicValue::OffsetDateTime(v.extract::>()?) } + schema::BasicValueType::TimeDelta => { + value::BasicValue::TimeDelta(v.extract::()?) + } schema::BasicValueType::Json => { value::BasicValue::Json(Arc::from(depythonize::(v)?)) } From d903d066346b28c1b9bb693d984c1d3b8a2b32b0 Mon Sep 17 00:00:00 2001 From: lemorage Date: Sun, 18 May 2025 11:25:22 +0200 Subject: [PATCH 2/4] refactor: update duration parser to align with ISO 8601 standard - Replaced naive parsing with structured component-based approach - Enhanced both iso8601 and human-readable parsers with proper validation and unit order checks - Added more tests, achieving 97.52% test coverage --- docs/docs/core/data_types.mdx | 2 +- src/base/duration.rs | 612 +++++++++++++++++++++++++--------- 2 files changed, 453 insertions(+), 161 deletions(-) diff --git a/docs/docs/core/data_types.mdx b/docs/docs/core/data_types.mdx index 04c91f5..18a09bf 100644 --- a/docs/docs/core/data_types.mdx +++ b/docs/docs/core/data_types.mdx @@ -35,7 +35,7 @@ This is the list of all basic types supported by CocoIndex: | Time | | `datetime.time` | `datetime.time` | | LocalDatetime | Date and time without timezone | `cocoindex.LocalDateTime` | `datetime.datetime` | | OffsetDatetime | Date and time with a timezone offset | `cocoindex.OffsetDateTime` | `datetime.datetime` | -| TimeDelta | A duration of time | `cocoindex.typing.TimeDelta` | `datetime.timedelta` | +| TimeDelta | A duration of time | `datetime.TimeDelta` | `datetime.timedelta` | | Vector[*T*, *Dim*?] | *T* must be basic type. *Dim* is a positive integer and optional. |`cocoindex.Vector[T]` or `cocoindex.Vector[T, Dim]` | `list[T]` | | Json | | `cocoindex.Json` | Any data convertible to JSON by `json` package | diff --git a/src/base/duration.rs b/src/base/duration.rs index 96efcb2..8719b47 100644 --- a/src/base/duration.rs +++ b/src/base/duration.rs @@ -1,87 +1,242 @@ use anyhow::{Result, anyhow}; use chrono::Duration; -/// Parses a duration string into a chrono::Duration. -pub fn parse_duration(s: &str) -> Result { - let s = s.trim(); - if s.is_empty() { - return Err(anyhow!("Empty duration string")); +/// Parses a string of number-unit pairs into a vector of (number, unit), +/// ensuring units are among the allowed ones. +fn parse_components( + s: &str, + allowed_units: &[char], + original_input: &str, +) -> Result> { + let mut result = Vec::new(); + let mut iter = s.chars().peekable(); + while iter.peek().is_some() { + let mut num_str = String::new(); + while let Some(&c) = iter.peek() { + if c.is_digit(10) { + num_str.push(iter.next().unwrap()); + } else { + break; + } + } + if num_str.is_empty() { + return Err(anyhow!("Expected number in: {}", original_input)); + } + let num = num_str + .parse() + .map_err(|_| anyhow!("Invalid number '{}' in: {}", num_str, original_input))?; + if let Some(&unit) = iter.peek() { + if allowed_units.contains(&unit) { + result.push((num, unit)); + iter.next(); + } else { + return Err(anyhow!("Invalid unit '{}' in: {}", unit, original_input)); + } + } else { + return Err(anyhow!( + "Missing unit after number '{}' in: {}", + num_str, + original_input + )); + } } + Ok(result) +} - // Try ISO 8601-like format - if s.starts_with('P') { - let s = s - .strip_prefix('P') - .ok_or_else(|| anyhow!("Invalid duration: {}", s))?; - let mut total = Duration::zero(); - let mut num = String::new(); - let mut negative = false; - - for c in s.chars() { - match c { - '-' if num.is_empty() => { - negative = true; - } - c if c.is_ascii_digit() => { - num.push(c); - } - unit => { - if num.is_empty() { - return Err(anyhow!("Missing number in duration: {}", s)); - } - let n: i64 = num - .parse() - .map_err(|_| anyhow!("Invalid number: {}", num))?; - let n = if negative { -n } else { n }; - match unit.to_ascii_uppercase() { - 'D' => total += Duration::days(n), - 'H' => total += Duration::hours(n), - 'M' => total += Duration::minutes(n), - 'S' => total += Duration::seconds(n), - c => return Err(anyhow!("Invalid unit in duration: {}", c)), - } - num.clear(); - negative = false; - } +/// Checks if the sequence of units follows the specified order. +fn check_order(units: &[char], order: &[char], input: &str) -> Result<()> { + let mut last_pos = -1; + for &unit in units { + if let Some(pos) = order.iter().position(|&u| u == unit) { + if pos as i32 <= last_pos { + return Err(anyhow!("Units out of order in: {}", input)); } + last_pos = pos as i32; + } else { + return Err(anyhow!("Invalid unit '{}' in: {}", unit, input)); } + } + Ok(()) +} + +/// Parses an ISO 8601 duration string into a `chrono::Duration`. +fn parse_iso8601_duration(s: &str, original_input: &str) -> Result { + let (is_negative, s_after_sign) = if s.starts_with('-') { + (true, &s[1..]) + } else { + (false, s) + }; + + if !s_after_sign.starts_with('P') { + return Err(anyhow!( + "Duration must start with 'P' in: {}", + original_input + )); + } + let s_after_p = &s_after_sign[1..]; + + let (date_part, time_part) = if let Some(pos) = s_after_p.find('T') { + (&s_after_p[..pos], Some(&s_after_p[pos + 1..])) + } else { + (s_after_p, None) + }; + + // Date components (Y, M, W, D) + let date_components = parse_components(date_part, &['Y', 'M', 'W', 'D'], original_input)?; + + // Time components (H, M, S) + let time_components = if let Some(time_str) = time_part { + let comps = parse_components(time_str, &['H', 'M', 'S'], original_input)?; + if comps.is_empty() { + return Err(anyhow!( + "Time part present but no time components in: {}", + original_input + )); + } + comps + } else { + vec![] + }; + + // Duplicate units not allowed in date part + let mut seen_date_units = std::collections::HashSet::new(); + for &(_, unit) in &date_components { + if !seen_date_units.insert(unit) { + return Err(anyhow!( + "Duplicate '{}' in date part: {}", + unit, + original_input + )); + } + } - if total != Duration::zero() { - return Ok(total); + // Duplicate units not allowed in time part + let mut seen_time_units = std::collections::HashSet::new(); + for &(_, unit) in &time_components { + if !seen_time_units.insert(unit) { + return Err(anyhow!( + "Duplicate '{}' in time part: {}", + unit, + original_input + )); } - return Err(anyhow!("Invalid duration: {}", s)); } - // Try human-readable format + // Check date units are in order + let date_units: Vec = date_components.iter().map(|&(_, u)| u).collect(); + check_order(&date_units, &['Y', 'M', 'W', 'D'], original_input)?; + + // Check time units are in order + let time_units: Vec = time_components.iter().map(|&(_, u)| u).collect(); + check_order(&time_units, &['H', 'M', 'S'], original_input)?; + + if date_components.is_empty() && time_components.is_empty() { + return Err(anyhow!("No components in duration: {}", original_input)); + } + + // Accumulate date duration + let date_duration = + date_components + .iter() + .fold(Duration::zero(), |acc, &(num, unit)| match unit { + 'Y' => acc + Duration::days(num * 365), + 'M' => acc + Duration::days(num * 30), + 'W' => acc + Duration::days(num * 7), + 'D' => acc + Duration::days(num), + _ => unreachable!("Invalid date unit should be caught by prior validation"), + }); + + // Accumulate time duration + let time_duration = + time_components + .iter() + .fold(Duration::zero(), |acc, &(num, unit)| match unit { + 'H' => acc + Duration::hours(num), + 'M' => acc + Duration::minutes(num), + 'S' => acc + Duration::seconds(num), + _ => unreachable!("Invalid time unit should be caught by prior validation"), + }); + + let mut total = date_duration + time_duration; + if is_negative { + total = -total; + } + + Ok(total) +} + +/// Parses a human-readable duration string into a `chrono::Duration`. +fn parse_human_readable_duration(s: &str, original_input: &str) -> Result { let parts: Vec<&str> = s.split_whitespace().collect(); - let mut total = Duration::zero(); - let mut i = 0; + if parts.is_empty() || parts.len() % 2 != 0 { + return Err(anyhow!( + "Invalid human-readable duration format in: {}", + original_input + )); + } + + let components: Vec<(i64, &str)> = parts + .chunks(2) + .map(|chunk| { + let num_str = chunk[0]; + let num = num_str + .parse() + .map_err(|_| anyhow!("Invalid number '{}' in: {}", num_str, original_input))?; + Ok((num, chunk[1])) + }) + .collect::>>()?; + + let units: Vec<&str> = components.iter().map(|&(_, u)| u).collect(); + check_order( + &units + .iter() + .map(|u| u.to_lowercase().chars().next().unwrap()) + .collect::>(), + &['d', 'h', 'm', 's', 'i', 'c'], // Abbreviation of day, hour, minute, second, millisecond, microsecond + original_input, + )?; + + let total = components + .iter() + .fold(Duration::zero(), |acc, &(num, unit)| { + match unit.to_lowercase().as_str() { + "day" | "days" => acc + Duration::days(num), + "hour" | "hours" => acc + Duration::hours(num), + "minute" | "minutes" => acc + Duration::minutes(num), + "second" | "seconds" => acc + Duration::seconds(num), + "millisecond" | "milliseconds" => acc + Duration::milliseconds(num), + "microsecond" | "microseconds" => acc + Duration::microseconds(num), + _ => unreachable!("Invalid unit should be caught by prior validation"), + } + }); - while i < parts.len() { - let num_str = parts[i]; - let num: i64 = num_str - .parse() - .map_err(|_| anyhow!("Invalid number: {}", num_str))?; - i += 1; - let unit = parts - .get(i) - .ok_or_else(|| anyhow!("Missing unit in duration: {}", s))?; - match unit.to_lowercase().as_str() { - "day" | "days" => total += Duration::days(num), - "hour" | "hours" => total += Duration::hours(num), - "minute" | "minutes" => total += Duration::minutes(num), - "second" | "seconds" => total += Duration::seconds(num), - "millisecond" | "milliseconds" => total += Duration::milliseconds(num), - "microsecond" | "microseconds" => total += Duration::microseconds(num), - unit => return Err(anyhow!("Unknown unit: {}", unit)), + Ok(total) +} + +/// Parses a duration string into a `chrono::Duration`, trying ISO 8601 first, then human-readable format. +pub fn parse_duration(s: &str) -> Result { + let original_input = s; + let s = s.trim(); + if s.is_empty() { + return Err(anyhow!("Empty duration string")); + } + + fn is_likely_iso8601(s: &str) -> bool { + match s.as_bytes() { + [c, ..] if c.eq_ignore_ascii_case(&b'P') => true, + [b'-', c, ..] if c.eq_ignore_ascii_case(&b'P') => true, + _ => false, } - i += 1; } - if total == Duration::zero() { - return Err(anyhow!("Invalid duration: {}", s)); + if is_likely_iso8601(s) { + // Try ISO 8601 format without fallbacks + return parse_iso8601_duration(s, original_input); } - Ok(total) + + // Allow maybe ISO-ish formats, so still try iso8601 first + parse_iso8601_duration(s, original_input) + .or_else(|_| parse_human_readable_duration(s, original_input)) } #[cfg(test)] @@ -129,45 +284,161 @@ mod tests { #[test] fn test_iso_just_p() { - check_err_contains(parse_duration("P"), "Invalid duration: ", "\"P\""); + check_err_contains(parse_duration("P"), "No components in duration: P", "\"P\""); } #[test] - fn test_iso_missing_number_before_unit() { + fn test_iso_pt() { check_err_contains( - parse_duration("PH"), - "Missing number in duration: H", - "\"PH\"", + parse_duration("PT"), + "Time part present but no time components in: PT", + "\"PT\"", ); } #[test] - fn test_iso_char_t_issue() { - check_err_contains( - parse_duration("PT1H"), - "Missing number in duration: T1H", - "\"PT1H\"", - ); + fn test_iso_missing_number_before_unit_in_date_part() { + check_err_contains(parse_duration("PD"), "Expected number in: PD", "\"PD\""); + } + #[test] + fn test_iso_missing_number_before_unit_in_time_part() { + check_err_contains(parse_duration("PTM"), "Expected number in: PTM", "\"PTM\""); + } + + #[test] + fn test_iso_time_unit_without_t() { + check_err_contains(parse_duration("P1H"), "Invalid unit 'H' in: P1H", "\"P1H\""); + check_err_contains(parse_duration("P1S"), "Invalid unit 'S' in: P1S", "\"P1S\""); } #[test] fn test_iso_invalid_number_parse() { check_err_contains( - parse_duration("P99999999999999999999H"), - "Invalid number: 99999999999999999999", - "\"P99999999999999999999H\"", + parse_duration("PT99999999999999999999H"), + "Invalid number '99999999999999999999' in: PT99999999999999999999H", + "\"PT99999999999999999999H\"", ); } #[test] fn test_iso_invalid_unit() { + check_err_contains(parse_duration("P1X"), "Invalid unit 'X' in: P1X", "\"P1X\""); + check_err_contains( + parse_duration("PT1X"), + "Invalid unit 'X' in: PT1X", + "\"PT1X\"", + ); + } + + #[test] + fn test_iso_valid_lowercase_unit_is_not_allowed() { + check_err_contains( + parse_duration("p1h"), + "Duration must start with 'P' in: p1h", + "\"p1h\"", + ); + check_err_contains( + parse_duration("PT1h"), + "Invalid unit 'h' in: PT1h", + "\"PT1h\"", + ); + } + + #[test] + fn test_iso_trailing_number_error() { + check_err_contains( + parse_duration("P1D2"), + "Missing unit after number '2' in: P1D2", + "\"P1D2\"", + ); + } + + #[test] + fn test_iso_trailing_number_without_unit_after_p() { + check_err_contains( + parse_duration("P1"), + "Missing unit after number '1' in: P1", + "\"P1\"", + ); + } + + #[test] + fn test_iso_fractional_seconds_fail() { + check_err_contains( + parse_duration("PT1.5S"), + "Invalid unit '.' in: PT1.5S", + "\"PT1.5S\"", + ); + } + + #[test] + fn test_iso_misplaced_t() { + check_err_contains( + parse_duration("P1DT2H T3M"), + "Expected number in: P1DT2H T3M", + "\"P1DT2H T3M\"", + ); + check_err_contains( + parse_duration("P1T2H"), + "Missing unit after number '1' in: P1T2H", + "\"P1T2H\"", + ); + } + + #[test] + fn test_iso_out_of_order_unit() { + check_err_contains( + parse_duration("P1H2D"), + "Invalid unit 'H' in: P1H2D", // Why not units out of order + "\"P1H2D\"", + ); + check_err_contains( + parse_duration("PT2S1H"), + "Units out of order in: PT2S1H", + "\"PT2S1H\"", + ); + check_err_contains( + parse_duration("P1W1Y"), + "Units out of order in: P1W1Y", + "\"P1W1Y\"", + ); + } + + #[test] + fn test_iso_duplicated_unit() { + check_err_contains( + parse_duration("P1D1D"), + "Duplicate 'D' in date part: P1D1D", + "\"P1D1D\"", + ); + check_err_contains( + parse_duration("PT1H1H"), + "Duplicate 'H' in time part: PT1H1H", + "\"PT1H1H\"", + ); + } + + #[test] + fn test_iso_negative_number_after_p() { check_err_contains( - parse_duration("P1X"), - "Invalid unit in duration: X", - "\"P1X\"", + parse_duration("P-1D"), + "Expected number in: P-1D", + "\"P-1D\"", ); } + #[test] + fn test_iso_valid_months() { + check_ok(parse_duration("P1M"), Duration::days(30), "\"P1M\""); + check_ok(parse_duration(" P13M"), Duration::days(13 * 30), "\"P13M\""); + } + + #[test] + fn test_iso_valid_weeks() { + check_ok(parse_duration("P1W"), Duration::days(7), "\"P1W\""); + check_ok(parse_duration(" P1W "), Duration::days(7), "\"P1W\""); + } + #[test] fn test_iso_valid_days() { check_ok(parse_duration("P1D"), Duration::days(1), "\"P1D\""); @@ -175,72 +446,81 @@ mod tests { #[test] fn test_iso_valid_hours() { - check_ok(parse_duration("P2H"), Duration::hours(2), "\"P2H\""); + check_ok(parse_duration("PT2H"), Duration::hours(2), "\"PT2H\""); } #[test] fn test_iso_valid_minutes() { - check_ok(parse_duration("P3M"), Duration::minutes(3), "\"P3M\""); + check_ok(parse_duration("PT3M"), Duration::minutes(3), "\"PT3M\""); } #[test] fn test_iso_valid_seconds() { - check_ok(parse_duration("P4S"), Duration::seconds(4), "\"P4S\""); + check_ok(parse_duration("PT4S"), Duration::seconds(4), "\"PT4S\""); } #[test] - fn test_iso_valid_lowercase_p_and_unit() { - check_err_contains(parse_duration("p1h"), "Invalid number: p1h", "\"p1h\""); + fn test_iso_combined_units() { + check_ok( + parse_duration("P1Y2M3W4DT5H6M7S"), + Duration::days(365 + 60 + 3 * 7 + 4) + + Duration::hours(5) + + Duration::minutes(6) + + Duration::seconds(7), + "\"P1Y2M3DT4H5M6S\"", + ); + check_ok( + parse_duration("P1DT2H3M4S"), + Duration::days(1) + Duration::hours(2) + Duration::minutes(3) + Duration::seconds(4), + "\"P1DT2H3M4S\"", + ); } #[test] - fn test_iso_valid_uppercase_p_lowercase_unit() { - check_ok(parse_duration("P1h"), Duration::hours(1), "\"P1h\""); + fn test_iso_negative_duration_p1d() { + check_ok(parse_duration("-P1D"), -Duration::days(1), "\"-P1D\""); } #[test] - fn test_iso_combined_units() { - check_ok( - parse_duration("P1D2H3M4S"), - Duration::days(1) + Duration::hours(2) + Duration::minutes(3) + Duration::seconds(4), - "\"P1D2H3M4S\"", - ); + fn test_iso_zero_duration_pd0() { + check_ok(parse_duration("P0D"), Duration::zero(), "\"P0D\""); } #[test] - fn test_iso_trailing_number_bug_demonstration_1() { - check_ok( - parse_duration("P1D2"), - Duration::days(1), - "\"P1D2\" (bug: trailing '2' ignored)", - ); + fn test_iso_zero_duration_pt0s() { + check_ok(parse_duration("PT0S"), Duration::zero(), "\"PT0S\""); } #[test] - fn test_iso_trailing_number_bug_demonstration_2() { - check_err_contains(parse_duration("P1"), "Invalid duration: 1", "\"P1\""); + fn test_iso_zero_duration_pt0h0m0s() { + check_ok(parse_duration("PT0H0M0S"), Duration::zero(), "\"PT0H0M0S\""); } + // Human-readable Tests #[test] - fn test_iso_zero_duration_pd0() { - check_err_contains(parse_duration("P0D"), "Invalid duration: 0D", "\"P0D\""); + fn test_human_missing_unit() { + check_err_contains( + parse_duration("1"), + "Invalid human-readable duration format in: 1", + "\"1\"", + ); } #[test] - fn test_iso_zero_duration_pt0s_fail() { + fn test_human_missing_number() { check_err_contains( - parse_duration("PT0S"), - "Missing number in duration: T0S", - "\"PT0S\"", + parse_duration("day"), + "Invalid human-readable duration format in: day", + "\"day\"", ); } #[test] - fn test_iso_fractional_seconds_fail() { + fn test_human_incomplete_pair() { check_err_contains( - parse_duration("PT1.5S"), - "Missing number in duration: T1.5S", - "\"PT1.5S\"", + parse_duration("1 day 2"), + "Invalid human-readable duration format in: 1 day 2", + "\"1 day 2\"", ); } @@ -248,37 +528,54 @@ mod tests { fn test_human_invalid_number_at_start() { check_err_contains( parse_duration("one day"), - "Invalid number: one", + "Invalid number 'one' in: one day", "\"one day\"", ); } #[test] - fn test_human_float_number_fail() { + fn test_human_invalid_unit() { check_err_contains( - parse_duration("1.5 hours"), - "Invalid number: 1.5", - "\"1.5 hours\"", + parse_duration("1 hour 2 minutes 3 seconds four seconds"), + "Invalid number 'four' in: 1 hour 2 minutes 3 seconds four seconds", + "\"1 hour 2 minutes 3 seconds four seconds\"", ); } #[test] - fn test_human_missing_unit_after_number_at_end() { - check_err_contains(parse_duration("1"), "Missing unit in duration: 1", "\"1\""); + fn test_human_out_of_order() { + check_err_contains( + parse_duration("1 second 2 hours"), + "Units out of order in: 1 second 2 hours", + "\"1 second 2 hours\"", + ); } #[test] - fn test_human_missing_unit_after_number_in_middle() { + fn test_human_float_number_fail() { check_err_contains( - parse_duration("1 day 2"), - "Missing unit in duration: 1 day 2", - "\"1 day 2\"", + parse_duration("1.5 hours"), + "Invalid number '1.5' in: 1.5 hours", + "\"1.5 hours\"", + ); + } + + #[test] + fn test_invalid_human_readable_no_pairs() { + check_err_contains( + parse_duration("just some words"), + "Invalid human-readable duration format in: just some words", + "\"just some words\"", ); } #[test] fn test_human_unknown_unit() { - check_err_contains(parse_duration("1 year"), "Unknown unit: year", "\"1 year\""); + check_err_contains( + parse_duration("1 year"), + "Invalid unit 'y' in: 1 year", + "\"1 year\"", + ); } #[test] @@ -374,41 +671,55 @@ mod tests { } #[test] - fn test_human_combined_units() { - let expected = Duration::days(1) - + Duration::hours(2) - + Duration::minutes(30) - + Duration::seconds(15) - + Duration::milliseconds(100) - + Duration::microseconds(200); + fn test_human_combined() { + let expected = + Duration::days(1) + Duration::hours(2) + Duration::minutes(3) + Duration::seconds(4); check_ok( - parse_duration("1 day 2 hours 30 minutes 15 seconds 100 milliseconds 200 microseconds"), + parse_duration("1 day 2 hours 3 minutes 4 seconds"), expected, - "\"1 day 2 hours 30 minutes 15 seconds 100 milliseconds 200 microseconds\"", + "\"1 day 2 hours 3 minutes 4 seconds\"", ); } #[test] fn test_human_zero_duration_seconds() { - check_err_contains( + check_ok( parse_duration("0 seconds"), - "Invalid duration: 0 seconds", + Duration::zero(), "\"0 seconds\"", ); } #[test] fn test_human_zero_duration_days_hours() { - check_err_contains( + check_ok( parse_duration("0 day 0 hour"), - "Invalid duration: 0 day 0 hour", + Duration::zero(), "\"0 day 0 hour\"", ); } + #[test] + fn test_human_zero_duration_multiple_zeros() { + check_ok( + parse_duration("0 days 0 hours 0 minutes 0 seconds"), + Duration::zero(), + "\"0 days 0 hours 0 minutes 0 seconds\"", + ); + } + #[test] fn test_human_no_space_between_num_unit() { - check_err_contains(parse_duration("1day"), "Invalid number: 1day", "\"1day\""); + check_err_contains( + parse_duration("1day"), + "Invalid human-readable duration format in: 1day", + "\"1day\"", + ); + } + + #[test] + fn test_human_trimmed() { + check_ok(parse_duration(" 1 day "), Duration::days(1), "\" 1 day \""); } #[test] @@ -420,10 +731,6 @@ mod tests { ); } - #[test] - fn test_human_just_unit_no_number() { - check_err_contains(parse_duration("day"), "Invalid number: day", "\"day\""); - } #[test] fn test_human_negative_numbers() { check_ok( @@ -437,19 +744,4 @@ mod tests { "\"1 day -2 hours\"", ); } - - #[test] - fn test_iso_negative_numbers() { - check_ok(parse_duration("P-1D"), Duration::days(-1), "\"P-1D\""); - check_ok( - parse_duration("P-1H-2M"), - Duration::hours(-1) + Duration::minutes(-2), - "\"P-1H-2M\"", - ); - check_err_contains( - parse_duration("P-1-2H"), - "Invalid unit in duration: -", - "\"P-1-2H\"", - ); - } } From 7f108833f575ff803f8d8690599a40eb8901162c Mon Sep 17 00:00:00 2001 From: lemorage Date: Mon, 19 May 2025 12:03:45 +0200 Subject: [PATCH 3/4] feat: make duration parser lenient by allowing duplicate and out of order units --- docs/docs/core/data_types.mdx | 2 +- python/cocoindex/typing.py | 1 - src/base/duration.rs | 192 +++++++++++----------------------- 3 files changed, 63 insertions(+), 132 deletions(-) diff --git a/docs/docs/core/data_types.mdx b/docs/docs/core/data_types.mdx index 18a09bf..9329db3 100644 --- a/docs/docs/core/data_types.mdx +++ b/docs/docs/core/data_types.mdx @@ -35,7 +35,7 @@ This is the list of all basic types supported by CocoIndex: | Time | | `datetime.time` | `datetime.time` | | LocalDatetime | Date and time without timezone | `cocoindex.LocalDateTime` | `datetime.datetime` | | OffsetDatetime | Date and time with a timezone offset | `cocoindex.OffsetDateTime` | `datetime.datetime` | -| TimeDelta | A duration of time | `datetime.TimeDelta` | `datetime.timedelta` | +| TimeDelta | A duration of time | `datetime.timedelta` | `datetime.timedelta` | | Vector[*T*, *Dim*?] | *T* must be basic type. *Dim* is a positive integer and optional. |`cocoindex.Vector[T]` or `cocoindex.Vector[T, Dim]` | `list[T]` | | Json | | `cocoindex.Json` | Any data convertible to JSON by `json` package | diff --git a/python/cocoindex/typing.py b/python/cocoindex/typing.py index 90637f7..dca0d19 100644 --- a/python/cocoindex/typing.py +++ b/python/cocoindex/typing.py @@ -29,7 +29,6 @@ def __init__(self, key: str, value: Any): Json = Annotated[Any, TypeKind('Json')] LocalDateTime = Annotated[datetime.datetime, TypeKind('LocalDateTime')] OffsetDateTime = Annotated[datetime.datetime, TypeKind('OffsetDateTime')] -TimeDelta = Annotated[datetime.timedelta, TypeKind('TimeDelta')] if TYPE_CHECKING: T_co = TypeVar('T_co', covariant=True) diff --git a/src/base/duration.rs b/src/base/duration.rs index 8719b47..c71dcba 100644 --- a/src/base/duration.rs +++ b/src/base/duration.rs @@ -1,4 +1,4 @@ -use anyhow::{Result, anyhow}; +use anyhow::{Result, anyhow, bail}; use chrono::Duration; /// Parses a string of number-unit pairs into a vector of (number, unit), @@ -30,7 +30,7 @@ fn parse_components( result.push((num, unit)); iter.next(); } else { - return Err(anyhow!("Invalid unit '{}' in: {}", unit, original_input)); + bail!("Invalid unit '{}' in: {}", unit, original_input); } } else { return Err(anyhow!( @@ -43,22 +43,6 @@ fn parse_components( Ok(result) } -/// Checks if the sequence of units follows the specified order. -fn check_order(units: &[char], order: &[char], input: &str) -> Result<()> { - let mut last_pos = -1; - for &unit in units { - if let Some(pos) = order.iter().position(|&u| u == unit) { - if pos as i32 <= last_pos { - return Err(anyhow!("Units out of order in: {}", input)); - } - last_pos = pos as i32; - } else { - return Err(anyhow!("Invalid unit '{}' in: {}", unit, input)); - } - } - Ok(()) -} - /// Parses an ISO 8601 duration string into a `chrono::Duration`. fn parse_iso8601_duration(s: &str, original_input: &str) -> Result { let (is_negative, s_after_sign) = if s.starts_with('-') { @@ -98,38 +82,6 @@ fn parse_iso8601_duration(s: &str, original_input: &str) -> Result { vec![] }; - // Duplicate units not allowed in date part - let mut seen_date_units = std::collections::HashSet::new(); - for &(_, unit) in &date_components { - if !seen_date_units.insert(unit) { - return Err(anyhow!( - "Duplicate '{}' in date part: {}", - unit, - original_input - )); - } - } - - // Duplicate units not allowed in time part - let mut seen_time_units = std::collections::HashSet::new(); - for &(_, unit) in &time_components { - if !seen_time_units.insert(unit) { - return Err(anyhow!( - "Duplicate '{}' in time part: {}", - unit, - original_input - )); - } - } - - // Check date units are in order - let date_units: Vec = date_components.iter().map(|&(_, u)| u).collect(); - check_order(&date_units, &['Y', 'M', 'W', 'D'], original_input)?; - - // Check time units are in order - let time_units: Vec = time_components.iter().map(|&(_, u)| u).collect(); - check_order(&time_units, &['H', 'M', 'S'], original_input)?; - if date_components.is_empty() && time_components.is_empty() { return Err(anyhow!("No components in duration: {}", original_input)); } @@ -175,42 +127,26 @@ fn parse_human_readable_duration(s: &str, original_input: &str) -> Result = parts + let durations: Result> = parts .chunks(2) .map(|chunk| { - let num_str = chunk[0]; - let num = num_str + let num: i64 = chunk[0] .parse() - .map_err(|_| anyhow!("Invalid number '{}' in: {}", num_str, original_input))?; - Ok((num, chunk[1])) - }) - .collect::>>()?; - - let units: Vec<&str> = components.iter().map(|&(_, u)| u).collect(); - check_order( - &units - .iter() - .map(|u| u.to_lowercase().chars().next().unwrap()) - .collect::>(), - &['d', 'h', 'm', 's', 'i', 'c'], // Abbreviation of day, hour, minute, second, millisecond, microsecond - original_input, - )?; - - let total = components - .iter() - .fold(Duration::zero(), |acc, &(num, unit)| { - match unit.to_lowercase().as_str() { - "day" | "days" => acc + Duration::days(num), - "hour" | "hours" => acc + Duration::hours(num), - "minute" | "minutes" => acc + Duration::minutes(num), - "second" | "seconds" => acc + Duration::seconds(num), - "millisecond" | "milliseconds" => acc + Duration::milliseconds(num), - "microsecond" | "microseconds" => acc + Duration::microseconds(num), - _ => unreachable!("Invalid unit should be caught by prior validation"), + .map_err(|_| anyhow!("Invalid number '{}' in: {}", chunk[0], original_input))?; + + match chunk[1].to_lowercase().as_str() { + "day" | "days" => Ok(Duration::days(num)), + "hour" | "hours" => Ok(Duration::hours(num)), + "minute" | "minutes" => Ok(Duration::minutes(num)), + "second" | "seconds" => Ok(Duration::seconds(num)), + "millisecond" | "milliseconds" => Ok(Duration::milliseconds(num)), + "microsecond" | "microseconds" => Ok(Duration::microseconds(num)), + _ => bail!("Invalid unit '{}' in: {}", chunk[1], original_input), } - }); + }) + .collect(); - Ok(total) + durations.map(|durs| durs.into_iter().sum()) } /// Parses a duration string into a `chrono::Duration`, trying ISO 8601 first, then human-readable format. @@ -230,13 +166,10 @@ pub fn parse_duration(s: &str) -> Result { } if is_likely_iso8601(s) { - // Try ISO 8601 format without fallbacks - return parse_iso8601_duration(s, original_input); + parse_iso8601_duration(s, original_input) + } else { + parse_human_readable_duration(s, original_input) } - - // Allow maybe ISO-ish formats, so still try iso8601 first - parse_iso8601_duration(s, original_input) - .or_else(|_| parse_human_readable_duration(s, original_input)) } #[cfg(test)] @@ -385,39 +318,6 @@ mod tests { ); } - #[test] - fn test_iso_out_of_order_unit() { - check_err_contains( - parse_duration("P1H2D"), - "Invalid unit 'H' in: P1H2D", // Why not units out of order - "\"P1H2D\"", - ); - check_err_contains( - parse_duration("PT2S1H"), - "Units out of order in: PT2S1H", - "\"PT2S1H\"", - ); - check_err_contains( - parse_duration("P1W1Y"), - "Units out of order in: P1W1Y", - "\"P1W1Y\"", - ); - } - - #[test] - fn test_iso_duplicated_unit() { - check_err_contains( - parse_duration("P1D1D"), - "Duplicate 'D' in date part: P1D1D", - "\"P1D1D\"", - ); - check_err_contains( - parse_duration("PT1H1H"), - "Duplicate 'H' in time part: PT1H1H", - "\"PT1H1H\"", - ); - } - #[test] fn test_iso_negative_number_after_p() { check_err_contains( @@ -476,6 +376,33 @@ mod tests { ); } + #[test] + fn test_iso_duplicated_unit() { + check_ok(parse_duration("P1D1D"), Duration::days(2), "\"P1D1D\""); + check_ok(parse_duration("PT1H1H"), Duration::hours(2), "\"PT1H1H\""); + } + + #[test] + fn test_iso_out_of_order_unit() { + check_ok( + parse_duration("P1W1Y"), + Duration::days(365 + 7), + "\"P1W1Y\"", + ); + check_ok( + parse_duration("PT2S1H"), + Duration::hours(1) + Duration::seconds(2), + "\"PT2S1H\"", + ); + check_ok(parse_duration("P3M"), Duration::days(90), "\"PT2S1H\""); + check_ok(parse_duration("PT3M"), Duration::minutes(3), "\"PT2S1H\""); + check_err_contains( + parse_duration("P1H2D"), + "Invalid unit 'H' in: P1H2D", // Time part without 'T' is invalid + "\"P1H2D\"", + ); + } + #[test] fn test_iso_negative_duration_p1d() { check_ok(parse_duration("-P1D"), -Duration::days(1), "\"-P1D\""); @@ -542,15 +469,6 @@ mod tests { ); } - #[test] - fn test_human_out_of_order() { - check_err_contains( - parse_duration("1 second 2 hours"), - "Units out of order in: 1 second 2 hours", - "\"1 second 2 hours\"", - ); - } - #[test] fn test_human_float_number_fail() { check_err_contains( @@ -573,7 +491,7 @@ mod tests { fn test_human_unknown_unit() { check_err_contains( parse_duration("1 year"), - "Invalid unit 'y' in: 1 year", + "Invalid unit 'year' in: 1 year", "\"1 year\"", ); } @@ -681,6 +599,20 @@ mod tests { ); } + #[test] + fn test_human_out_of_order() { + check_ok( + parse_duration("1 second 2 hours"), + Duration::hours(2) + Duration::seconds(1), + "\"1 second 2 hours\"", + ); + check_ok( + parse_duration("7 minutes 6 hours 5 days"), + Duration::days(5) + Duration::hours(6) + Duration::minutes(7), + "\"7 minutes 6 hours 5 days\"", + ) + } + #[test] fn test_human_zero_duration_seconds() { check_ok( From 9e7e1b208d40dd2c3f622f9d4d931002c2e464d2 Mon Sep 17 00:00:00 2001 From: lemorage Date: Tue, 20 May 2025 05:00:29 +0200 Subject: [PATCH 4/4] refactor: inline code with some cleanups --- src/base/duration.rs | 37 ++++++++++++++++-------------------- src/ops/storages/postgres.rs | 3 --- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/src/base/duration.rs b/src/base/duration.rs index c71dcba..dd18ea5 100644 --- a/src/base/duration.rs +++ b/src/base/duration.rs @@ -20,7 +20,7 @@ fn parse_components( } } if num_str.is_empty() { - return Err(anyhow!("Expected number in: {}", original_input)); + bail!("Expected number in: {}", original_input); } let num = num_str .parse() @@ -33,11 +33,11 @@ fn parse_components( bail!("Invalid unit '{}' in: {}", unit, original_input); } } else { - return Err(anyhow!( + bail!( "Missing unit after number '{}' in: {}", num_str, original_input - )); + ); } } Ok(result) @@ -52,10 +52,7 @@ fn parse_iso8601_duration(s: &str, original_input: &str) -> Result { }; if !s_after_sign.starts_with('P') { - return Err(anyhow!( - "Duration must start with 'P' in: {}", - original_input - )); + bail!("Duration must start with 'P' in: {}", original_input); } let s_after_p = &s_after_sign[1..]; @@ -72,10 +69,10 @@ fn parse_iso8601_duration(s: &str, original_input: &str) -> Result { let time_components = if let Some(time_str) = time_part { let comps = parse_components(time_str, &['H', 'M', 'S'], original_input)?; if comps.is_empty() { - return Err(anyhow!( + bail!( "Time part present but no time components in: {}", original_input - )); + ); } comps } else { @@ -83,7 +80,7 @@ fn parse_iso8601_duration(s: &str, original_input: &str) -> Result { }; if date_components.is_empty() && time_components.is_empty() { - return Err(anyhow!("No components in duration: {}", original_input)); + bail!("No components in duration: {}", original_input); } // Accumulate date duration @@ -121,10 +118,10 @@ fn parse_iso8601_duration(s: &str, original_input: &str) -> Result { fn parse_human_readable_duration(s: &str, original_input: &str) -> Result { let parts: Vec<&str> = s.split_whitespace().collect(); if parts.is_empty() || parts.len() % 2 != 0 { - return Err(anyhow!( + bail!( "Invalid human-readable duration format in: {}", original_input - )); + ); } let durations: Result> = parts @@ -154,18 +151,16 @@ pub fn parse_duration(s: &str) -> Result { let original_input = s; let s = s.trim(); if s.is_empty() { - return Err(anyhow!("Empty duration string")); + bail!("Empty duration string"); } - fn is_likely_iso8601(s: &str) -> bool { - match s.as_bytes() { - [c, ..] if c.eq_ignore_ascii_case(&b'P') => true, - [b'-', c, ..] if c.eq_ignore_ascii_case(&b'P') => true, - _ => false, - } - } + let is_likely_iso8601 = match s.as_bytes() { + [c, ..] if c.eq_ignore_ascii_case(&b'P') => true, + [b'-', c, ..] if c.eq_ignore_ascii_case(&b'P') => true, + _ => false, + }; - if is_likely_iso8601(s) { + if is_likely_iso8601 { parse_iso8601_duration(s, original_input) } else { parse_human_readable_duration(s, original_input) diff --git a/src/ops/storages/postgres.rs b/src/ops/storages/postgres.rs index a6f163e..30e0b59 100644 --- a/src/ops/storages/postgres.rs +++ b/src/ops/storages/postgres.rs @@ -220,9 +220,6 @@ fn from_pg_value(row: &PgRow, field_idx: usize, typ: &ValueType) -> Result row .try_get::>, _>(field_idx)? .map(BasicValue::OffsetDateTime), - // BasicValueType::TimeDelta => row - // .try_get::, _>(field_idx)? - // .map(BasicValue::TimeDelta), BasicValueType::TimeDelta => row .try_get::, _>(field_idx)? .map(|pg_interval| {