Skip to content

Commit

Permalink
mostly docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
fenke committed Jun 27, 2024
1 parent b3c3274 commit 25eaf60
Show file tree
Hide file tree
Showing 4 changed files with 412 additions and 59 deletions.
2 changes: 1 addition & 1 deletion corebridge/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.13"
__version__ = "0.2.14"
111 changes: 90 additions & 21 deletions corebridge/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,29 @@

# %% ../nbs/00_core.ipynb 7
def set_time_index_zone(df:pd.DataFrame, timezone):
"""
Sets the time zone of the index of a pandas DataFrame.
Args:
df (pd.DataFrame): The DataFrame whose index time zone is to be set.
timezone (str): The desired time zone.
Returns:
pd.DataFrame: The DataFrame with its index time zone set to the specified time zone.
Raises:
None
Examples:
>>> df = pd.DataFrame({'A': [1, 2, 3]}, index=pd.DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03']))
>>> set_time_index_zone(df, 'Europe/Berlin')
A
2022-01-01 1
2022-01-02 2
2022-01-03 3
DatetimeIndex: 3 entries, 2022-01-01 01:00:00+01:00 to 2022-01-03 01:00:00+01:00
"""

if isinstance(df.index, pd.DatetimeIndex):
df.index.name = 'time'
if not hasattr(df.index, 'tz') or not df.index.tz or not df.index.tz:
Expand All @@ -29,13 +52,22 @@ def set_time_index_zone(df:pd.DataFrame, timezone):
return df


# %% ../nbs/00_core.ipynb 9
# %% ../nbs/00_core.ipynb 11
def timeseries_dataframe(
data:typing.Union[pd.DataFrame, pd.Series, dict, np.ndarray, np.recarray],
timezone='UTC',
columnnames=None):

"""Convert various tabular data formats to timeseries DataFrame"""
"""Convert various tabular data formats to timeseries DataFrame
Args:
data (Union[pd.DataFrame, pd.Series, dict, np.ndarray, np.recarray]): The input data to be converted.
timezone (str, optional): The timezone to set for the index of the DataFrame. Defaults to 'UTC'.
columnnames (Optional[List[str]]): The column names to use for the DataFrame. Defaults to None.
Returns:
pd.DataFrame: The converted timeseries DataFrame with the index set to the specified timezone.
"""

if isinstance(data, pd.DataFrame):
df = data
Expand All @@ -44,7 +76,7 @@ def timeseries_dataframe(
df = pd.DataFrame(data)

elif isinstance(data, dict):
# dict/mapping of individual timeseries
# assume a dict/mapping of individual arrays representing timeseries
df = pd.DataFrame({
C:pd.Series(data=A[:,1], index=pd.DatetimeIndex(A[:,0]*1e9)) if isinstance(A, np.ndarray) else A
for C,A in data.items()
Expand Down Expand Up @@ -77,14 +109,21 @@ def timeseries_dataframe(

return set_time_index_zone(df, timezone)

# %% ../nbs/00_core.ipynb 11
# %% ../nbs/00_core.ipynb 14
def timeseries_dataframe_from_datadict(
data:dict,
timecolumns=None,
recordformat='records'):

"Convert data dict to dataframe"

"""
Converts a data dict into a pandas DataFrame based on the specified record format.
Parameters:
- data: A dictionary containing the data to convert.
- timecolumns: A list of column names to be treated as time columns.
- recordformat: A string specifying the format of the data records ('records', 'table', 'split', 'index', 'tight').
Returns:
- df: A pandas DataFrame with a DatetimeIndex representing the converted data.
"""
orient = recordformat.lower()
assert orient in ['records', 'table', 'split', 'index', 'tight']

Expand All @@ -104,29 +143,52 @@ def timeseries_dataframe_from_datadict(
df.columns = list(df.columns)
df[time_column] = pd.to_datetime(df[time_column],utc=True,format='ISO8601')
df.set_index(time_column, inplace=True)
#df.index = pd.DatetimeIndex(df.index).round('ms')
df.index = pd.DatetimeIndex(df.index).round('ms')

df.index.name = 'time'

return df


# %% ../nbs/00_core.ipynb 14
# %% ../nbs/00_core.ipynb 17
def pop_nan_values(data):
    """
    Strip null entries from (possibly nested) lists and dicts.

    Lists and dicts are rebuilt recursively; any other value is returned
    unchanged. An element (for lists) or key/value pair (for dicts) is kept
    only when ``pd.notnull([value]).any()`` is true for it.

    Args:
        data (Union[list, dict]): The data to be processed.

    Returns:
        Union[list, dict]: A new container of the same kind without the
        null entries; non-container input is passed through as-is.
    """

    def _has_content(value):
        # Wrapping in a list lets pd.notnull treat scalars and containers
        # uniformly and always hand back an array that supports .any().
        return pd.notnull([value]).any()

    if isinstance(data, dict):
        return {
            key: pop_nan_values(value)
            for key, value in data.items()
            if _has_content(value)
        }

    if isinstance(data, list):
        return [pop_nan_values(item) for item in data if _has_content(item)]

    return data

# %% ../nbs/00_core.ipynb 15
# %% ../nbs/00_core.ipynb 18
def timeseries_dataframe_to_datadict(
data:typing.Union[pd.DataFrame, pd.Series, dict],
recordformat:str='records',
timezone:str='UTC',
popNaN:bool=False):


"""
Convert a timeseries DataFrame or Series into a dictionary representation.
Args:
data (Union[pd.DataFrame, pd.Series, dict]): The input data to be converted. It can be a pandas DataFrame, Series, or a dictionary.
recordformat (str, optional): The format of the output records. Defaults to 'records'.
timezone (str, optional): The timezone to use for the DataFrame index. Defaults to 'UTC'.
popNaN (bool, optional): Whether to remove NaN values from the output dictionary. Defaults to False.
Returns:
Union[dict, list]: The converted dictionary representation of the input data. If `popNaN` is True, it returns a dictionary with NaN values removed. Otherwise, it returns a dictionary or a list of dictionaries depending on the `recordformat` parameter.
"""

orient = recordformat.lower()

normalized_data = timeseries_dataframe(data, timezone=timezone)
Expand All @@ -137,18 +199,17 @@ def timeseries_dataframe_to_datadict(
records = normalized_data.reset_index().to_dict(orient='records')
else:
records = normalized_data.to_dict(orient=orient)


if popNaN and normalized_data.isna().any(axis=None):
#return pop_nan_values(records)
return [ {k:v for k,v in m.items() if pd.notnull(v)} for m in records]
else:
return records
return pop_nan_values(records)

return records


# %% ../nbs/00_core.ipynb 25
#def interpolate_timeseries(sampler, period, method_args):


# %% ../nbs/00_core.ipynb 22
ResamplerMethods = dict(
count=lambda R: R.count(),
median=lambda R: R.median(),
Expand All @@ -158,18 +219,26 @@ def timeseries_dataframe_to_datadict(
sum=lambda R: R.sum(),
std=lambda R: R.std(),
var=lambda R: R.var(),
cumsum=lambda R: R.cumsum(),
cummax=lambda R: R.cummax(),
cummin=lambda R: R.cummin(),

nearest=lambda R: R.nearest(),
)

# Translation table from pandas' legacy single-letter frequency aliases to
# their lowercase spellings. The resample helper looks a period up here and
# falls back to the period string itself when it is not listed.
# NOTE: seconds must map to 's'; 'sec' is accepted as a Timedelta unit
# string but is not an offset alias, so resample() would reject it.
ReSamplerPeriods = dict(
    H='h', T='min', S='s', L='ms', U='us', N='ns'
)

def timeseries_dataframe_resample(df:pd.DataFrame, period:str, method:str):

"""
Resamples a time-series DataFrame on the specified period and method.
Parameters:
df (pd.DataFrame): The input time-series DataFrame.
period (str): The resampling period.
method (str): The resampling method. Can be a string of multiple methods separated by ';'.
method_args (dict, optional): Additional arguments for the resampling method.
Returns:
pd.DataFrame: The resampled DataFrame.
"""
sampler = df.resample(ReSamplerPeriods.get(period, str(period)))

dataframes = [df]
Expand Down
Loading

0 comments on commit 25eaf60

Please sign in to comment.