Add path statistics

lsbardel · lsbardel · commit 7bdaaff03850 · 2024-12-28T16:18:31.000Z
diff --git a/notebooks/applications/hurst.md b/notebooks/applications/hurst.md
@@ -17,32 +17,49 @@ The [Hurst exponent](https://en.wikipedia.org/wiki/Hurst_exponent) is used as a
 
 It is a statistics which can be used to test if a time-series is mean reverting or it is trending.
 
-```{code-cell} ipython3
-from quantflow.sp.cir import CIR
++++
 
-p = CIR(kappa=1, sigma=1)
-```
+## Study the Wiener process OHLC
 
-## Study the Weiner process OHLC 
+We want to construct a mechanism to estimate the Hurst exponent via OHLC data.
+In order to evaluate the results against a known solution, we take the Wiener process as the generator of the time series. In this way we know exactly what the variance should be.
 
 ```{code-cell} ipython3
 from quantflow.sp.weiner import WeinerProcess
+from quantflow.utils.dates import start_of_day
 p = WeinerProcess(sigma=0.5)
-paths = p.sample(1, 1, 1000)
-df = paths.as_datetime_df().reset_index()
+paths = p.sample(1, 1, 24*60*60)
+paths.plot()
+```
+
+```{code-cell} ipython3
+df = paths.as_datetime_df(start=start_of_day()).reset_index()
 df
 ```
 
+At this point we estimate the standard deviation using the **realized variance** along the path (we use the `scaled` flag so that the standard deviation is scaled by the square root of the time step).
+
+```{code-cell} ipython3
+float(paths.path_std(scaled=True)[0])
+```
+
 ```{code-cell} ipython3
 from quantflow.ta.ohlc import OHLC
+from dataclasses import replace
 from datetime import timedelta
 ohlc = OHLC(serie="0", period="10m", rogers_satchell_variance=True, parkinson_variance=True, garman_klass_variance=True)
-result = ohlc(df)
-result
+ohlc(df)
 ```
 
 ```{code-cell} ipython3
-
+import pandas as pd
+results = []
+for period in ("2m", "5m", "10m", "30m", "1h", "4h"):
+    operator = ohlc.model_copy(update=dict(period=period))
+    result = operator(df).sum()
+    results.append(dict(period=period, pk=result["0_pk"].item(), gk=result["0_gk"].item(), rs=result["0_rs"].item()))
+vdf = pd.DataFrame(results)
+vdf
 ```
 
 # Links
@@ -66,6 +83,10 @@ from quantflow.utils.dates import utcnow
 pd.date_range(start=utcnow(), periods=10, freq="0.5S")
 ```
 
+```{code-cell} ipython3
+7*7+3*3
+```
+
 ```{code-cell} ipython3
 
 ```
diff --git a/quantflow/data/vault.py b/quantflow/data/vault.py
@@ -2,6 +2,7 @@
 
 
 class Vault:
+    """Keeps key-value pairs in a file."""
 
     def __init__(self, path: str | Path) -> None:
         self.path = Path(path)
@@ -17,22 +18,27 @@ def load(self) -> dict[str, str]:
         return data
 
     def add(self, key: str, value: str) -> None:
+        """Add a key-value pair to the vault."""
         self.data[key] = value
         self.save()
 
     def delete(self, key: str) -> bool:
+        """Delete a key-value pair from the vault."""
         if self.data.pop(key, None) is not None:
             self.save()
             return True
         return False
 
     def get(self, key: str) -> str | None:
+        """Get the value of a key if available otherwise None."""
         return self.data.get(key)
 
     def keys(self) -> list[str]:
+        """Get the keys in the vault."""
         return sorted(self.data)
 
     def save(self) -> None:
+        """Save the data to the file."""
         with open(self.path, "w") as file:
             for key in sorted(self.data):
                 value = self.data[key]
diff --git a/quantflow/ta/base.py b/quantflow/ta/base.py
@@ -6,7 +6,9 @@
 DataFrame: TypeAlias = pl.DataFrame | pd.DataFrame
 
 
-def to_polars(df: DataFrame) -> pl.DataFrame:
+def to_polars(df: DataFrame, *, copy: bool = False) -> pl.DataFrame:
     if isinstance(df, pd.DataFrame):
         return pl.DataFrame(df)
+    elif copy:
+        return df.clone()
     return df
diff --git a/quantflow/ta/ohlc.py b/quantflow/ta/ohlc.py
@@ -1,14 +1,13 @@
-from dataclasses import dataclass
 from datetime import timedelta
 
 import numpy as np
 import polars as pl
+from pydantic import BaseModel
 
 from .base import DataFrame, to_polars
 
 
-@dataclass
-class OHLC:
+class OHLC(BaseModel):
     """Aggregates OHLC data over a given period and serie
 
     Optionally calculates the range-based variance estimators for the serie.
@@ -50,7 +49,7 @@ def close_col(self) -> pl.Expr:
     def __call__(self, df: DataFrame) -> pl.DataFrame:
         """Returns a dataframe with OHLC data sampled over the given period"""
         result = (
-            to_polars(df)
+            to_polars(df, copy=True)
             .group_by_dynamic(self.index_column, every=self.period)
             .agg(
                 pl.col(self.serie).first().alias(f"{self.serie}_open"),
diff --git a/quantflow/utils/dates.py b/quantflow/utils/dates.py
@@ -5,7 +5,20 @@ def utcnow() -> datetime:
     return datetime.now(timezone.utc)
 
 
+def as_utc(dt: date | None = None) -> datetime:
+    if dt is None:
+        return utcnow()
+    elif isinstance(dt, datetime):
+        return dt.astimezone(timezone.utc)
+    else:
+        return datetime(dt.year, dt.month, dt.day, tzinfo=timezone.utc)
+
+
 def isoformat(date: str | date) -> str:
     if isinstance(date, str):
         return date
     return date.isoformat()
+
+
+def start_of_day(dt: date | None = None) -> datetime:
+    return as_utc(dt).replace(hour=0, minute=0, second=0, microsecond=0)
diff --git a/quantflow/utils/paths.py b/quantflow/utils/paths.py
@@ -19,10 +19,13 @@ class Paths(BaseModel, arbitrary_types_allowed=True):
     """Paths of a stochastic process"""
 
     t: float = Field(description="time horizon")
+    """Time horizon - the unit of time is not specified"""
     data: FloatArray = Field(description="paths")
+    """Paths of the stochastic process"""
 
     @property
     def dt(self) -> float:
+        """Time step"""
         return self.t / self.time_steps
 
     @property
@@ -64,17 +67,42 @@ def dates(
         return pd.date_range(start=start, end=end, periods=self.time_steps + 1)
 
     def mean(self) -> FloatArray:
-        """Mean of paths"""
+        """Paths cross-section mean"""
         return np.mean(self.data, axis=1)
 
     def std(self) -> FloatArray:
-        """Standard deviation of paths"""
+        """Paths cross-section standard deviation"""
         return np.std(self.data, axis=1)
 
     def var(self) -> FloatArray:
-        """Variance of paths"""
+        """Paths cross-section variance"""
         return np.var(self.data, axis=1)
 
+    def paths_mean(self, *, scaled: bool = False) -> FloatArray:
+        """mean for each path
+
+        If scaled is True, the mean is scaled by the time step
+        """
+        scale = self.dt if scaled else 1.0
+        return np.mean(self.data, axis=0) / scale
+
+    def path_std(self, *, scaled: bool = False) -> FloatArray:
+        """standard deviation for each path
+
+        If scaled is True, the standard deviation is scaled by the square
+        root of the time step
+        """
+        scale = np.sqrt(self.dt) if scaled else 1.0
+        return np.std(np.diff(self.data, axis=0), axis=0) / scale
+
+    def path_var(self, *, scaled: bool = False) -> FloatArray:
+        """variance for each path
+
+        If scaled is True, the variance is scaled by the time step
+        """
+        scale = self.dt if scaled else 1.0
+        return np.var(np.diff(self.data, axis=0), axis=0) / scale
+
     def as_datetime_df(
         self, *, start: datetime | None = None, unit: str = "d"
     ) -> pd.DataFrame:
diff --git a/quantflow/utils/volatility.py b/quantflow/utils/volatility.py
diff --git a/quantflow_tests/test_utils.py b/quantflow_tests/test_utils.py
@@ -0,0 +1,11 @@
+from quantflow.utils.numbers import round_to_step, to_decimal
+
+
+def test_round_to_step():
+    assert str(round_to_step(1.234, 0.1)) == "1.2"
+    assert str(round_to_step(1.234, 0.01)) == "1.23"
+    assert str(round_to_step(1.236, 0.01)) == "1.24"
+    assert str(round_to_step(1.1, 0.01)) == "1.10"
+    assert str(round_to_step(1.1, 0.001)) == "1.100"
+    assert str(round_to_step(2, 0.001)) == "2.000"
+    assert str(round_to_step(to_decimal("2.00000000000"), 0.001)) == "2.000"