From 25eaf60bb89c22428017e9a35c0c93ce6756d733 Mon Sep 17 00:00:00 2001 From: Fenke Meijer Date: Thu, 27 Jun 2024 11:53:09 +0200 Subject: [PATCH] mostly docstrings --- corebridge/__init__.py | 2 +- corebridge/core.py | 111 ++++++++++--- nbs/00_core.ipynb | 356 ++++++++++++++++++++++++++++++++++++----- settings.ini | 2 +- 4 files changed, 412 insertions(+), 59 deletions(-) diff --git a/corebridge/__init__.py b/corebridge/__init__.py index 11ef092..f3291e9 100644 --- a/corebridge/__init__.py +++ b/corebridge/__init__.py @@ -1 +1 @@ -__version__ = "0.2.13" +__version__ = "0.2.14" diff --git a/corebridge/core.py b/corebridge/core.py index c53a988..abe3928 100644 --- a/corebridge/core.py +++ b/corebridge/core.py @@ -19,6 +19,29 @@ # %% ../nbs/00_core.ipynb 7 def set_time_index_zone(df:pd.DataFrame, timezone): + """ + Sets the time zone of the index of a pandas DataFrame. + + Args: + df (pd.DataFrame): The DataFrame whose index time zone is to be set. + timezone (str): The desired time zone. + + Returns: + pd.DataFrame: The DataFrame with its index time zone set to the specified time zone. 
+ + Raises: + None + + Examples: + >>> df = pd.DataFrame({'A': [1, 2, 3]}, index=pd.DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03'])) + >>> set_time_index_zone(df, 'Europe/Berlin') + A + time + 2022-01-01 01:00:00+01:00 1 + 2022-01-02 01:00:00+01:00 2 + 2022-01-03 01:00:00+01:00 3 + """ + if isinstance(df.index, pd.DatetimeIndex): df.index.name = 'time' if not hasattr(df.index, 'tz') or not df.index.tz or not df.index.tz: @@ -29,13 +52,22 @@ def set_time_index_zone(df:pd.DataFrame, timezone): return df -# %% ../nbs/00_core.ipynb 9 +# %% ../nbs/00_core.ipynb 11 def timeseries_dataframe( data:typing.Union[pd.DataFrame, pd.Series, dict, np.ndarray, np.recarray], timezone='UTC', columnnames=None): - """Convert various tabular data formats to timeseries DataFrame""" + """Convert various tabular data formats to timeseries DataFrame + + Args: + data (Union[pd.DataFrame, pd.Series, dict, np.ndarray, np.recarray]): The input data to be converted. + timezone (str, optional): The timezone to set for the index of the DataFrame. Defaults to 'UTC'. + columnnames (Optional[List[str]]): The column names to use for the DataFrame. Defaults to None. + + Returns: + pd.DataFrame: The converted timeseries DataFrame with the index set to the specified timezone. 
+ """ if isinstance(data, pd.DataFrame): df = data @@ -44,7 +76,7 @@ def timeseries_dataframe( df = pd.DataFrame(data) elif isinstance(data, dict): - # dict/mapping of individual timeseries + # assume a dict/mapping of individual arrays representing timeseries df = pd.DataFrame({ C:pd.Series(data=A[:,1], index=pd.DatetimeIndex(A[:,0]*1e9)) if isinstance(A, np.ndarray) else A for C,A in data.items() @@ -77,14 +109,21 @@ def timeseries_dataframe( return set_time_index_zone(df, timezone) -# %% ../nbs/00_core.ipynb 11 +# %% ../nbs/00_core.ipynb 14 def timeseries_dataframe_from_datadict( data:dict, timecolumns=None, recordformat='records'): - "Convert data dict to dataframe" - + """ + Converts a data dict into a pandas DataFrame based on the specified record format. + Parameters: + - data: A dictionary containing the data to convert. + - timecolumns: A list of column names to be treated as time columns. + - recordformat: A string specifying the format of the data records ('records', 'table', 'split', 'index', 'tight'). + Returns: + - df: A pandas DataFrame with a DatetimeIndex representing the converted data. + """ orient = recordformat.lower() assert orient in ['records', 'table', 'split', 'index', 'tight'] @@ -104,15 +143,25 @@ def timeseries_dataframe_from_datadict( df.columns = list(df.columns) df[time_column] = pd.to_datetime(df[time_column],utc=True,format='ISO8601') df.set_index(time_column, inplace=True) - #df.index = pd.DatetimeIndex(df.index).round('ms') + df.index = pd.DatetimeIndex(df.index).round('ms') df.index.name = 'time' return df -# %% ../nbs/00_core.ipynb 14 +# %% ../nbs/00_core.ipynb 17 def pop_nan_values(data): + """ + Recursively pop keys with nan values from dict or lists with dicts. + + Args: + data (Union[list, dict]): The data to be processed. + + Returns: + Union[list, dict]: The processed data with keys with nan values removed. 
+ """ + if isinstance(data, list): return [pop_nan_values(v) for v in data if pd.notnull([v]).any()] elif isinstance(data, dict): @@ -120,13 +169,26 @@ def pop_nan_values(data): else: return data -# %% ../nbs/00_core.ipynb 15 +# %% ../nbs/00_core.ipynb 18 def timeseries_dataframe_to_datadict( data:typing.Union[pd.DataFrame, pd.Series, dict], recordformat:str='records', timezone:str='UTC', popNaN:bool=False): - + + """ + Convert a timeseries DataFrame or Series into a dictionary representation. + + Args: + data (Union[pd.DataFrame, pd.Series, dict]): The input data to be converted. It can be a pandas DataFrame, Series, or a dictionary. + recordformat (str, optional): The format of the output records. Defaults to 'records'. + timezone (str, optional): The timezone to use for the DataFrame index. Defaults to 'UTC'. + popNaN (bool, optional): Whether to remove NaN values from the output dictionary. Defaults to False. + + Returns: + Union[dict, list]: The converted dictionary representation of the input data. If `popNaN` is True, it returns a dictionary with NaN values removed. Otherwise, it returns a dictionary or a list of dictionaries depending on the `recordformat` parameter. 
+ """ + orient = recordformat.lower() normalized_data = timeseries_dataframe(data, timezone=timezone) @@ -137,18 +199,17 @@ def timeseries_dataframe_to_datadict( records = normalized_data.reset_index().to_dict(orient='records') else: records = normalized_data.to_dict(orient=orient) - if popNaN and normalized_data.isna().any(axis=None): - #return pop_nan_values(records) - return [ {k:v for k,v in m.items() if pd.notnull(v)} for m in records] - else: - return records + return pop_nan_values(records) + + return records +# %% ../nbs/00_core.ipynb 25 +#def interpolate_timeseries(sampler, period, method_args): -# %% ../nbs/00_core.ipynb 22 ResamplerMethods = dict( count=lambda R: R.count(), median=lambda R: R.median(), @@ -158,10 +219,7 @@ def timeseries_dataframe_to_datadict( sum=lambda R: R.sum(), std=lambda R: R.std(), var=lambda R: R.var(), - cumsum=lambda R: R.cumsum(), - cummax=lambda R: R.cummax(), - cummin=lambda R: R.cummin(), - + nearest=lambda R: R.nearest(), ) ReSamplerPeriods = dict( @@ -169,7 +227,18 @@ def timeseries_dataframe_to_datadict( ) def timeseries_dataframe_resample(df:pd.DataFrame, period:str, method:str): - + """ + Resamples a time-series DataFrame on the specified period and method. + + Parameters: + df (pd.DataFrame): The input time-series DataFrame. + period (str): The resampling period. + method (str): The resampling method. Can be a string of multiple methods separated by ';'. + method_args (dict, optional): Additional arguments for the resampling method. + + Returns: + pd.DataFrame: The resampled DataFrame. 
+ """ sampler = df.resample(ReSamplerPeriods.get(period, str(period))) dataframes = [df] diff --git a/nbs/00_core.ipynb b/nbs/00_core.ipynb index 77160e7..6bf36d2 100644 --- a/nbs/00_core.ipynb +++ b/nbs/00_core.ipynb @@ -5,7 +5,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# AICore-Bridge\n", + "# Core functionality\n", "\n", "> Fill in a module description here" ] @@ -71,7 +71,7 @@ "source": [ "### `set_time_index_zone`\n", "\n", - "Utility to set the timezone on a datetime index" + "Processing may depend on proper timezone awareness; this utility sets the timezone on a datetime index" ] }, { @@ -83,6 +83,29 @@ "#| export\n", "\n", "def set_time_index_zone(df:pd.DataFrame, timezone):\n", + " \"\"\"\n", + " Sets the time zone of the index of a pandas DataFrame.\n", + "\n", + " Args:\n", + " df (pd.DataFrame): The DataFrame whose index time zone is to be set.\n", + " timezone (str): The desired time zone.\n", + "\n", + " Returns:\n", + " pd.DataFrame: The DataFrame with its index time zone set to the specified time zone.\n", + "\n", + " Raises:\n", + " None\n", + "\n", + " Examples:\n", + " >>> df = pd.DataFrame({'A': [1, 2, 3]}, index=pd.DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03']))\n", + " >>> set_time_index_zone(df, 'Europe/Berlin')\n", + " A\n", + " time\n", + " 2022-01-01 01:00:00+01:00 1\n", + " 2022-01-02 01:00:00+01:00 2\n", + " 2022-01-03 01:00:00+01:00 3\n", + " \"\"\"\n", + " \n", " if isinstance(df.index, pd.DatetimeIndex):\n", " df.index.name = 'time'\n", " if not hasattr(df.index, 'tz') or not df.index.tz or not df.index.tz:\n", @@ -93,6 +116,81 @@ " return df\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
A
time
2022-01-01 01:00:00+01:001
2022-01-02 01:00:00+01:002
2022-01-03 01:00:00+01:003
\n", + "
" + ], + "text/plain": [ + " A\n", + "time \n", + "2022-01-01 01:00:00+01:00 1\n", + "2022-01-02 01:00:00+01:00 2\n", + "2022-01-03 01:00:00+01:00 3" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({'A': [1, 2, 3]}, index=pd.DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03']))\n", + "set_time_index_zone(df, 'Europe/Berlin')" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -117,7 +215,16 @@ " timezone='UTC', \n", " columnnames=None):\n", " \n", - " \"\"\"Convert various tabular data formats to timeseries DataFrame\"\"\"\n", + " \"\"\"Convert various tabular data formats to timeseries DataFrame\n", + "\n", + " Args:\n", + " data (Union[pd.DataFrame, pd.Series, dict, np.ndarray, np.recarray]): The input data to be converted.\n", + " timezone (str, optional): The timezone to set for the index of the DataFrame. Defaults to 'UTC'.\n", + " columnnames (Optional[List[str]]): The column names to use for the DataFrame. 
Defaults to None.\n", + "\n", + " Returns:\n", + " pd.DataFrame: The converted timeseries DataFrame with the index set to the specified timezone.\n", + " \"\"\"\n", "\n", " if isinstance(data, pd.DataFrame):\n", " df = data\n", @@ -126,7 +233,7 @@ " df = pd.DataFrame(data)\n", "\n", " elif isinstance(data, dict):\n", - " # dict/mapping of individual timeseries\n", + " # assume a dict/mapping of individual arrays representing timeseries \n", " df = pd.DataFrame({\n", " C:pd.Series(data=A[:,1], index=pd.DatetimeIndex(A[:,0]*1e9)) if isinstance(A, np.ndarray) else A\n", " for C,A in data.items()\n", @@ -167,6 +274,13 @@ "outputs": [], "source": [] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `timeseries_dataframe_from_datadict`" + ] + }, { "cell_type": "code", "execution_count": null, @@ -179,8 +293,15 @@ " timecolumns=None,\n", " recordformat='records'):\n", " \n", - " \"Convert data dict to dataframe\"\n", - "\n", + " \"\"\"\n", + " Converts a data dict into a pandas DataFrame based on the specified record format. 
\n", + " Parameters:\n", + " - data: A dictionary containing the data to convert.\n", + " - timecolumns: A list of column names to be treated as time columns.\n", + " - recordformat: A string specifying the format of the data records ('records', 'table', 'split', 'index', 'tight').\n", + " Returns:\n", + " - df: A pandas DataFrame with a DatetimeIndex representing the converted data.\n", + " \"\"\"\n", " orient = recordformat.lower()\n", " assert orient in ['records', 'table', 'split', 'index', 'tight']\n", " \n", @@ -200,7 +321,7 @@ " df.columns = list(df.columns)\n", " df[time_column] = pd.to_datetime(df[time_column],utc=True,format='ISO8601')\n", " df.set_index(time_column, inplace=True)\n", - " #df.index = pd.DatetimeIndex(df.index).round('ms')\n", + " df.index = pd.DatetimeIndex(df.index).round('ms')\n", " \n", " df.index.name = 'time'\n", "\n", @@ -376,6 +497,16 @@ "#| export\n", "\n", "def pop_nan_values(data):\n", + " \"\"\"\n", + " Recursively pop keys with nan values from dict or lists with dicts.\n", + "\n", + " Args:\n", + " data (Union[list, dict]): The data to be processed.\n", + "\n", + " Returns:\n", + " Union[list, dict]: The processed data with keys with nan values removed.\n", + " \"\"\"\n", + " \n", " if isinstance(data, list):\n", " return [pop_nan_values(v) for v in data if pd.notnull([v]).any()]\n", " elif isinstance(data, dict):\n", @@ -396,7 +527,20 @@ " recordformat:str='records', \n", " timezone:str='UTC',\n", " popNaN:bool=False):\n", - " \n", + "\n", + " \"\"\"\n", + " Convert a timeseries DataFrame or Series into a dictionary representation.\n", + "\n", + " Args:\n", + " data (Union[pd.DataFrame, pd.Series, dict]): The input data to be converted. It can be a pandas DataFrame, Series, or a dictionary.\n", + " recordformat (str, optional): The format of the output records. Defaults to 'records'.\n", + " timezone (str, optional): The timezone to use for the DataFrame index. 
Defaults to 'UTC'.\n", + " popNaN (bool, optional): Whether to remove NaN values from the output dictionary. Defaults to False.\n", + "\n", + " Returns:\n", + " Union[dict, list]: The converted dictionary representation of the input data. If `popNaN` is True, it returns a dictionary with NaN values removed. Otherwise, it returns a dictionary or a list of dictionaries depending on the `recordformat` parameter.\n", + " \"\"\"\n", + " \n", " orient = recordformat.lower()\n", "\n", " normalized_data = timeseries_dataframe(data, timezone=timezone)\n", @@ -407,15 +551,11 @@ " records = normalized_data.reset_index().to_dict(orient='records')\n", " else:\n", " records = normalized_data.to_dict(orient=orient)\n", - " \n", "\n", " if popNaN and normalized_data.isna().any(axis=None):\n", - " #return pop_nan_values(records)\n", - " return [ {k:v for k,v in m.items() if pd.notnull(v)} for m in records]\n", - " else:\n", - " return records\n", - "\n", - "\n" + " return pop_nan_values(records)\n", + " \n", + " return records \n" ] }, { @@ -537,24 +677,9 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.notnull([[np.nan, 2]]).any()" - ] + "source": [] }, { "cell_type": "code", @@ -563,6 +688,10 @@ "outputs": [], "source": [ "#| export\n", + "\n", + "#def interpolate_timeseries(sampler, period, method_args):\n", + "\n", + "\n", "ResamplerMethods = dict(\n", " count=lambda R: R.count(),\n", " median=lambda R: R.median(),\n", @@ -572,10 +701,7 @@ " sum=lambda R: R.sum(),\n", " std=lambda R: R.std(),\n", " var=lambda R: R.var(),\n", - " cumsum=lambda R: R.cumsum(),\n", - " cummax=lambda R: R.cummax(),\n", - " cummin=lambda R: R.cummin(),\n", - "\n", + " nearest=lambda R: R.nearest(),\n", ")\n", "\n", "ReSamplerPeriods = dict(\n", @@ -583,7 +709,18 @@ ")\n", 
"\n", "def timeseries_dataframe_resample(df:pd.DataFrame, period:str, method:str):\n", + " \"\"\"\n", + " Resamples a time-series DataFrame on the specified period and method.\n", "\n", + " Parameters:\n", + " df (pd.DataFrame): The input time-series DataFrame.\n", + " period (str): The resampling period.\n", + " method (str): The resampling method. Can be a string of multiple methods separated by ';'.\n", + " method_args (dict, optional): Additional arguments for the resampling method.\n", + "\n", + " Returns:\n", + " pd.DataFrame: The resampled DataFrame.\n", + " \"\"\"\n", " sampler = df.resample(ReSamplerPeriods.get(period, str(period)))\n", "\n", " dataframes = [df]\n", @@ -596,6 +733,153 @@ "\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = timeseries_dataframe_from_datadict([\n", + " {\n", + " \"time\":\"2023-05-04T10:04:49.000Z\",\n", + " \"value\":16.72\n", + " },\n", + " {\n", + " \"time\":\"2023-05-04T10:24:51.000Z\",\n", + " \"value\":16.65\n", + " },\n", + " {\n", + " \"time\":\"2023-05-04T10:44:53.000Z\",\n", + " \"value\":16.55\n", + " },\n", + " {\n", + " \"time\":\"2023-05-04T11:04:49.000Z\",\n", + " \"value\":16.47\n", + " },\n", + " {\n", + " \"time\":\"2023-05-04T11:24:51.000Z\",\n", + " \"value\":16.44\n", + " },\n", + " {\n", + " \"time\":\"2023-05-04T11:44:53.000Z\",\n", + " \"value\":16.38\n", + " },\n", + " ], timecolumns=['time'])\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
valuevalue_meanvalue_count
time
2023-05-04 09:20:00+00:00NaN16.6852.0
2023-05-04 10:04:49+00:0016.72NaNNaN
2023-05-04 10:24:51+00:0016.65NaNNaN
2023-05-04 10:40:00+00:00NaN16.4604.0
2023-05-04 10:44:53+00:0016.55NaNNaN
2023-05-04 11:04:49+00:0016.47NaNNaN
2023-05-04 11:24:51+00:0016.44NaNNaN
2023-05-04 11:44:53+00:0016.38NaNNaN
\n", + "
" + ], + "text/plain": [ + " value value_mean value_count\n", + "time \n", + "2023-05-04 09:20:00+00:00 NaN 16.685 2.0\n", + "2023-05-04 10:04:49+00:00 16.72 NaN NaN\n", + "2023-05-04 10:24:51+00:00 16.65 NaN NaN\n", + "2023-05-04 10:40:00+00:00 NaN 16.460 4.0\n", + "2023-05-04 10:44:53+00:00 16.55 NaN NaN\n", + "2023-05-04 11:04:49+00:00 16.47 NaN NaN\n", + "2023-05-04 11:24:51+00:00 16.44 NaN NaN\n", + "2023-05-04 11:44:53+00:00 16.38 NaN NaN" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "timeseries_dataframe_resample(df, \"80min\", 'mean;count')" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/settings.ini b/settings.ini index 2683f38..3306615 100644 --- a/settings.ini +++ b/settings.ini @@ -5,7 +5,7 @@ ### Python library ### repo = corebridge lib_name = %(repo)s -version = 0.2.13 +version = 0.2.14 min_python = 3.7 license = apache2 black_formatting = False