Merge branch '0.2' into master
kylerbrown authored Feb 24, 2017
2 parents 55d71ff + cdb29a0 commit d6756c2
Showing 21 changed files with 835 additions and 327 deletions.
84 changes: 46 additions & 38 deletions README.md
@@ -7,7 +7,8 @@ directories, plain text files and simple binary arrays, Bark data can leverage a
Bark is also the fibrous outer layer of [ARF](https://github.com/melizalab/arf), wrapped around a few standard
file types.

Finally **BARK** is also an acronym for **B**ark is **A**rf **R**einterpreted by **K**yler.
**BARK** is also an acronym for **B**ark is **A**rf **R**einterpreted by **K**yler.


## Why use Bark instead of ARF?

@@ -36,15 +37,12 @@ from the specification, but gives Bark a few advantages:
## The elements of Bark
Bark trees are made from the following elements:

- A **Root** directory grouping a set of Entries together. This is a standard
filesystem directory containing one file named "meta", which contains
top-level metadata, and any number of Entry subdirectories.
- **Entries** (often trials) are directories containing Datasets that share a
common time base. These directories also contain a "meta" file and any number
common time base. These directories contain a `meta.yaml` file and any number
of Datasets.
- **SampledData** stored as raw binary arrays. Metadata is stored in another
file with ".meta" appended to the dataset's filename.
- **EventData** stored in CSV files. As above, metadata is stored in a ".meta"
file with ".meta.yaml" appended to the dataset's filename.
- **EventData** stored in CSV files. As above, metadata is stored in a "X.meta.yaml"
file.
- Every Bark element (Root, Entry, SampledData, EventData) has metadata stored in associated UTF-8-encoded YAML files.

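A minimal sketch of the on-disk layout these elements imply, using only the standard library, NumPy, and PyYAML; the directory names, attribute values, and channel count are illustrative, not part of the specification:

```python
import os
import numpy as np
import yaml

os.makedirs("my_experiment/trial_01", exist_ok=True)   # a Root containing one Entry

def dump_meta(path, **attrs):
    # write a UTF-8 YAML metadata file
    with open(path, "w", encoding="utf-8") as fp:
        yaml.safe_dump(attrs, fp, default_flow_style=False)

# Entry metadata
dump_meta("my_experiment/trial_01/meta.yaml",
          experimenter="kjbrown", timestamp=[1453096800, 0])

# SampledData: a raw binary array plus its ".meta.yaml" sidecar
np.zeros((30000, 2), dtype="int16").tofile("my_experiment/trial_01/mic.dat")
dump_meta("my_experiment/trial_01/mic.dat.meta.yaml",
          filetype="rawbinary", dtype="<i2", sampling_rate=30000)
```

In practice the command-line tools and Python functions described below create and read these files for you.
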
@@ -78,46 +76,51 @@ Every command has help accessible with the flag `-h` (e.g. `bark-root -h`).
- `bark-root` -- create root directories for experiments
- `bark-entry` -- create entry directories for datasets
- `bark-entry-from-prefix` -- create an entry from datasets with matching file prefixes
- `bark-clean-orphan-metas` -- remove orphan `.meta` files without associated datafiles
- `bark-scope` -- opens a Bark SampledData file in [neuroscope](http://neurosuite.sourceforge.net/). (Requires an installation of neuroscope)
- `bark-clean-orphan-metas` -- remove orphan `.meta.yaml` files without associated datafiles
- `bark-scope` -- opens a sampled data file in [neuroscope](http://neurosuite.sourceforge.net/). (Requires an installation of neuroscope)
- `bark-convert-rhd` -- converts [Intan](http://intantech.com/) .rhd files to datasets in a Bark entry
- `bark-convert-openephys` -- converts a folder of [Open-Ephys](http://www.open-ephys.org/) .kwd files to datasets in a Bark entry
- `bark-split` -- splits a dataset according to the split times in a label file, either in a single entry or in an entire bark tree
- `csv-from-waveclus` -- converts a [wave_clus](https://github.com/csn-le/wave_clus) spike time file to a csv
- `csv-from-textgrid` -- converts a [praat](http://www.fon.hum.uva.nl/praat/) TextGrid file to a csv
- `csv-from-lbl` -- converts an [aplot](https://github.com/melizalab/aplot) [lbl](https://github.com/kylerbrown/lbl) file to a csv
- `csv-from-plexon-csv` -- converts a [Plexon OFS](http://www.plexon.com/products/offline-sorter) waveform csv to a bark csv.
- `dat-decimate` -- downsamples a raw binary data file by an integer factor; you should low-pass filter your data first.
- `dat-segment` -- segments a file based on a band of spectral power, based on [Koumura & Okanoya](dx.doi.org/10.1371/journal.pone.0159188)
- `dat-decimate` -- downsamples a sampled dataset by an integer factor; you should low-pass filter your data first.
- `dat-select` -- extract a subset of channels from a sampled dataset
- `dat-join` -- combine the channels of two or more sampled datasets
- `dat-filter` -- apply zero-phase Butterworth or Bessel filters to a sampled dataset
- `dat-diff` -- subtract one sampled dataset channel from another
- `dat-cat` -- concatenate sampled datasets, adding more samples
- `dat-to-wave-clus` -- convert a sampled dataset to a [wave_clus](https://github.com/csn-le/wave_clus)
compatible Matlab file
- `dat-to-wav` -- convert a sampled dataset to a WAVE file.
- `dat-ref` -- for each channel: subtract the mean of all other channels, scaled by a coefficient such that the total power is minimized
- `dat-artifact` -- removes sections of a sampled dataset that exceed a threshold
- `dat-enrich` -- concatenates subsets of a sampled dataset based on events in an events dataset
- `dat-segment` -- segments a sampled dataset based on a band of spectral power, as described in [Koumura & Okanoya](dx.doi.org/10.1371/journal.pone.0159188)
- `bark-label-view` -- Annotate or review events in relation to a sampled dataset, such as birdsong syllable labels on a microphone recording.

For processing continuously sampled data, try the included Python module `bark.stream` or the
[datutils](https://github.com/kylerbrown/datutils) project, both of which provide a command line interface
for common data pipelines and adhere to the Bark/ARF standard.

Over time, tools from `datutils` will migrate into `bark`.

There are many tools for processing CSV files, including [pandas](http://pandas.pydata.org/) and [csvkit](https://csvkit.readthedocs.io).
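
Because event data and metadata are plain CSV and YAML, they can be inspected without bark at all; the paths below follow the example tree used in the Python interface section:

```python
import pandas as pd
import yaml

events = pd.read_csv("black5/2016-01-18/enr_hvc_times.csv")          # the EventData itself
with open("black5/2016-01-18/enr_hvc_times.csv.meta.yaml", encoding="utf-8") as fp:
    meta = yaml.safe_load(fp)                                        # its YAML sidecar
print(len(events), meta.get("units"))                                # row count and time units
```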

# Python interface

```python
import bark
root = bark.read_root("black5")
root.entries.keys()
# dict_keys(['2016-01-18', '2016-01-19', '2016-01-17', '2016-01-20', '2016-01-21'])
entry = root['2016-01-18']
entry.attrs
# {'bird': 'black5',
# 'experiment': 'hvc_syrinx_screwdrive',
# 'experimenter': 'kjbrown',
# 'timestamp': [1453096800, 0],
# 'uuid': 'a53d24af-ac13-4eb3-b5f4-0600a14bb7b0'}
entry.datasets.keys()
# dict_keys(['enr_emg.dat', 'enr_mic.dat', 'enr_emg_times.csv', 'enr_hvc.dat', 'raw.label', 'enr_hvc_times.csv', 'enr.label'])
hvc = entry['enr_hvc.dat']
hvc.data.shape
# (7604129, 3)
```


The `Stream` object in the `bark.stream` module exposes a powerful data pipeline design system for sampled data.
@@ -128,9 +131,9 @@ Example usage:

## Other common tasks

- Recursively search for datafile by metadata: `grep -R --include "*.meta" "source: hvc" PATH/TO/DATA`
- Recursively search for an entry or root by metadata: `grep -R --include "meta" "experimenter: kjbrown" PATH/TO/DATA`
- Add new metadata to file: `echo "condition: control" >> FILE.meta`
- Recursively search for datafile by metadata: `grep -R --include "*.meta.yaml" "source: hvc" PATH/TO/DATA`
- Recursively search for an entry by metadata: `grep -R --include "meta.yaml" "experimenter: kjbrown" PATH/TO/DATA`
- Add new metadata to file: `echo "condition: control" >> FILE.meta.yaml`

# Related projects

@@ -140,3 +143,8 @@ Example usage:
- neuroshare (<http://neuroshare.org>) is a set of routines for reading and
writing data in various proprietary and open formats.

# Authors

Dan Meliza created ARF.
Bark was written by Kyler Brown so he could finish his damn thesis in 2017. Graham Fetterman also made
considerable contributions.
129 changes: 75 additions & 54 deletions bark/bark.py
@@ -44,7 +44,7 @@

# hierarchical classes
class Root():
    def __init__(self, path, entries=None, attrs=None):
    def __init__(self, path, entries=None):
        if entries is None or attrs is None:
            self.read(path)
        else:
@@ -56,7 +56,6 @@ def __init__(self, path, entries=None, attrs=None):
    def read(self, name):
        self.path = os.path.abspath(name)
        self.name = os.path.split(self.path)[-1]
        self.attrs = read_metadata(os.path.join(self.path, "meta"))
        all_sub = [os.path.join(name, x) for x in listdir(self.path)]
        subdirs = [x for x in all_sub if os.path.isdir(x) and x[-1] != '.']
        self.entries = {os.path.split(x)[-1]: read_entry(x) for x in subdirs}
@@ -131,22 +130,52 @@ def write(self, path=None):
        write_events(path, self.data, **self.attrs)


def write_sampled(datfile, data, sampling_rate, units, **params):
def template_columns(fields):
    return {f: {'units': None} for f in fields}


def event_columns(dataframe, columns=None):
    if columns is None:
        return template_columns(dataframe.columns)
    for fieldkey in list(columns):
        if fieldkey not in dataframe.columns:
            del columns[fieldkey]
    for col in dataframe.columns:
        if col not in columns:
            columns[col] = {'units': None}
        if 'units' not in columns[col]:
            columns[col]['units'] = None


def sampled_columns(data, columns=None):
    'If columns=None, create new columns attribute, otherwise verify columns.'
    if len(data.shape) == 1:
        params["n_channels"] = 1
        n_channels = 1
    else:
        params["n_channels"] = data.shape[1]
        n_channels = data.shape[1]
    if columns is None:
        return template_columns(range(n_channels))
    if len(columns) != n_channels:
        raise ValueError(
            'the columns attribute does not match the number of columns')
    for i in range(n_channels):
        if i not in columns:
            raise ValueError(
                'the columns attribute is missing column {}'.format(i))
        if 'units' not in columns[i]:
            columns[i]['units'] = None


def write_sampled(datfile, data, sampling_rate, **params):
    if 'columns' not in params:
        params['columns'] = sampled_columns(data)
    params["dtype"] = data.dtype.str
    shape = data.shape
    mdata = np.memmap(datfile, dtype=params["dtype"], mode="w+", shape=shape)
    mdata[:] = data[:]
    params["filetype"] = "rawbinary"
    write_metadata(datfile + ".meta",
                   sampling_rate=sampling_rate,
                   units=units,
                   **params)
    write_metadata(datfile, sampling_rate=sampling_rate, **params)
    params['sampling_rate'] = sampling_rate
    params['units'] = units
    return SampledData(mdata, datfile, params)


@@ -157,79 +186,71 @@ def read_sampled(datfile, mode="r"):
    recommended).
    """
    path = os.path.abspath(datfile)
    params = read_metadata(datfile + ".meta")
    params = read_metadata(datfile)
    data = np.memmap(datfile, dtype=params["dtype"], mode=mode)
    data = data.reshape(-1, params["n_channels"])
    data = data.reshape(-1, len(params['columns']))
    return SampledData(data, path, params)


def write_events(eventsfile, data, **params):
    assert "units" in params and params["units"] in UNITS.TIME_UNITS
    if 'columns' not in params:
        params['columns'] = event_columns(data)
    data.to_csv(eventsfile, index=False)
    params["filetype"] = "csv"
    write_metadata(eventsfile + ".meta", **params)
    write_metadata(eventsfile, **params)
    return read_events(eventsfile)


def read_events(eventsfile):
    import pandas as pd
    data = pd.read_csv(eventsfile).fillna('')
    params = read_metadata(eventsfile + ".meta")
    params = read_metadata(eventsfile)
    return EventData(data, eventsfile, params)


def read_dataset(fname):
    "determines if file is sampled or event data and reads accordingly"
    params = read_metadata(fname + ".meta")
    if "units" in params and params["units"] in UNITS.TIME_UNITS:
    params = read_metadata(fname)
    if params["filetype"] == "csv":
        dset = read_events(fname)
    else:
    elif params["filetype"] == "rawbinary":
        dset = read_sampled(fname)
    else:
        raise ValueError('Unrecognized file format {}'.format(params[
            'filetype']))
    return dset


def read_metadata(metafile):
    try:
        with codecs.open(metafile, 'r', encoding='utf-8') as fp:
            params = yaml.safe_load(fp)
            return params
    except IOError as err:
        fname = os.path.splitext(metafile)[0]
        if fname == "meta":  # this was a root or entry metafile
            return {}
        elif os.path.splitext(fname)[-1] == '.meta':
def read_metadata(path, meta='meta.yaml'):
    if os.path.isdir(path):
        metafile = os.path.join(path, meta)
        return yaml.safe_load(open(metafile, 'r'))
    if os.path.isfile(path):
        metafile = path + '.' + meta
        if os.path.isfile(metafile):
            return yaml.safe_load(open(metafile, 'r'))
    elif os.path.splitext(fname)[-1] == '.' + meta:
        print("Tried to open metadata file instead of data file.")
        elif os.path.exists(fname):
            print(
                "{} is missing an associated .meta file, should named {}.meta"
                .format(fname, fname))
        else:
            print("{} does not exist".format(fname))
        sys.exit(0)
    if os.path.exists(fname):
        print("{} is missing an associated meta file, should named {}"
              .format(path, meta))
    else:
        print("{} does not exist".format(fname))
    sys.exit(0)


def write_metadata(filename, **params):
def write_metadata(path, meta='meta.yaml', **params):
    if os.path.isdir(path):
        metafile = os.path.join(path, meta)
    else:
        metafile = path + '.' + meta
    for k, v in params.items():
        if isinstance(v, (np.ndarray, np.generic)):
            params[k] = v.tolist()
    with codecs.open(filename, 'w', encoding='utf-8') as yaml_file:
        header = """# metadata using YAML syntax\n---\n"""
        yaml_file.write(header)
    with codecs.open(metafile, 'w', encoding='utf-8') as yaml_file:
        yaml_file.write(yaml.safe_dump(params, default_flow_style=False))


def create_root(name, parents=False, **attrs):
    """creates a new BARK top-level directory"""
    path = os.path.abspath(name)
    if os.path.isdir(path):
        if not parents:
            raise IOError("{} already exists".format(path))
    else:
        os.makedirs(path)
    write_metadata(os.path.join(path, "meta"), **attrs)
    return Root(name)


def read_root(name):
    return Root(name)

@@ -265,16 +286,16 @@ def create_entry(name, timestamp, parents=False, **attributes):
    if "uuid" not in attributes:
        attributes["uuid"] = str(uuid4())
    attributes["timestamp"] = convert_timestamp(timestamp)
    write_metadata(os.path.join(name, "meta"), **attributes)
    write_metadata(os.path.join(name), **attributes)
    return read_entry(name)


def read_entry(name):
    path = os.path.abspath(name)
    dsets = {}
    attrs = read_metadata(os.path.join(path, "meta"))
    attrs = read_metadata(os.path.join(path))
    # load only files with associated metadata files
    dset_metas = glob(os.path.join(path, "*.meta"))
    dset_metas = glob(os.path.join(path, "*.meta.yaml"))
    dset_full_names = [x[:-5] for x in dset_metas]
    dset_names = [os.path.split(x)[-1] for x in dset_full_names]
    datasets = {name: read_dataset(full_name)
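
The new sidecar-naming convention in `read_metadata`/`write_metadata` above can be exercised directly. A minimal sketch, assuming the helpers in `bark/bark.py` import as shown and using made-up paths:

```python
import os
from bark.bark import read_metadata, write_metadata

os.makedirs("trial_01", exist_ok=True)
write_metadata("trial_01", experimenter="kjbrown")   # directory -> trial_01/meta.yaml
write_metadata("trial_01/mic.dat", units="uV")       # dataset path -> trial_01/mic.dat.meta.yaml
                                                     # (the .dat itself need not exist yet)
print(read_metadata("trial_01"))                     # {'experimenter': 'kjbrown'}
```
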
2 changes: 1 addition & 1 deletion bark/io/lbl.py
@@ -20,7 +20,7 @@ def lbl_to_csv(fname, csvname, **attrs):
    lblstruct = read(fname)
    csvdata = pd.DataFrame(lblstruct)
    csvdata.to_csv(csvname, index=False)
    write_metadata(csvname + ".meta", **attrs)
    write_metadata(csvname, **attrs)

def _lbl_csv():
    'commandline script for lbl->csv conversion'
13 changes: 7 additions & 6 deletions bark/io/openephys/kwik2dat.py
@@ -48,15 +48,16 @@ def eofolder2entry(oefolder, entry_name, timestamp=None, parents=False, **attrs)

def write_from_kwd(kwd, dat):
    all_data = load_all(kwd)
    sampling_rate = all_data[0]["info"]["sample_rate"]
    n_channels = all_data[0]['data'].shape[1]
    for group_i, data in enumerate(all_data):
        write_binary(dat, data["data"])
    # reopen to deterimine number of samples
    temp = np.memmap(dat, dtype="int16", mode="r").reshape(-1, n_channels)
    n_samples = temp.shape[0]
    write_metadata(dat + ".meta", sampling_rate=sampling_rate,
                   n_samples=n_samples, n_channels=n_channels, dtype="int16")
        assert data["data"].shape[1] == n_channels
        sampling_rate = data["info"]["sample_rate"]
        columns = {i: {'units': 'uV',
                       'unit_scale': float(data['app_attrs']['channel_bit_volts'][i])}
                   for i in range(n_channels)}
        write_metadata(dat, sampling_rate=sampling_rate,
                       dtype=data['data'].dtype.str, columns=columns)


def kwd_to_entry():