pachterlab
diff --git a/.github/workflows/ci.yml
+1-1 b/.github/workflows/ci.yml
+1-1
diff --git a/.github/workflows/release.yml
+1-1 b/.github/workflows/release.yml
+1-1
diff --git a/.gitignore
+6 b/.gitignore
+6
diff --git a/Makefile
+2-1 b/Makefile
+2-1
diff --git a/README.md
+4-3 b/README.md
+4-3
diff --git a/dev-requirements.txt
+3-2 b/dev-requirements.txt
+3-2
diff --git a/docs/conf.py
+1-1 b/docs/conf.py
+1-1
diff --git a/docs/index.rst
+3-3 b/docs/index.rst
+3-3
diff --git a/kb_python/__init__.py
+1-1 b/kb_python/__init__.py
+1-1
diff --git a/kb_python/bins/darwin/bustools/bustools
15.3 KB b/kb_python/bins/darwin/bustools/bustools
15.3 KB
diff --git a/kb_python/bins/darwin/kallisto/kallisto
12.5 KB b/kb_python/bins/darwin/kallisto/kallisto
12.5 KB
diff --git a/kb_python/bins/darwin/kallisto/kallisto_k64
2.19 MB b/kb_python/bins/darwin/kallisto/kallisto_k64
2.19 MB
diff --git a/kb_python/bins/darwin/kallisto/kallisto_optoff
2.13 MB b/kb_python/bins/darwin/kallisto/kallisto_optoff
2.13 MB
diff --git a/kb_python/bins/darwin/kallisto/kallisto_optoff_k64
2.15 MB b/kb_python/bins/darwin/kallisto/kallisto_optoff_k64
2.15 MB
diff --git a/kb_python/bins/darwin/m1/bustools/bustools
15.3 KB b/kb_python/bins/darwin/m1/bustools/bustools
15.3 KB
diff --git a/kb_python/bins/darwin/m1/kallisto/kallisto
103 KB b/kb_python/bins/darwin/m1/kallisto/kallisto
103 KB
diff --git a/kb_python/bins/darwin/m1/kallisto/kallisto_k64
1.91 MB b/kb_python/bins/darwin/m1/kallisto/kallisto_k64
1.91 MB
diff --git a/kb_python/bins/darwin/m1/kallisto/kallisto_optoff
1.91 MB b/kb_python/bins/darwin/m1/kallisto/kallisto_optoff
1.91 MB
diff --git a/kb_python/bins/darwin/m1/kallisto/kallisto_optoff_k64
1.91 MB b/kb_python/bins/darwin/m1/kallisto/kallisto_optoff_k64
1.91 MB
diff --git a/kb_python/bins/linux/bustools/bustools
2.92 KB b/kb_python/bins/linux/bustools/bustools
2.92 KB
diff --git a/kb_python/bins/linux/kallisto/kallisto
-560 KB b/kb_python/bins/linux/kallisto/kallisto
-560 KB
diff --git a/kb_python/bins/linux/kallisto/kallisto_k64
8.07 MB b/kb_python/bins/linux/kallisto/kallisto_k64
8.07 MB
diff --git a/kb_python/bins/linux/kallisto/kallisto_optoff
8.02 MB b/kb_python/bins/linux/kallisto/kallisto_optoff
8.02 MB
diff --git a/kb_python/bins/linux/kallisto/kallisto_optoff_k64
8.03 MB b/kb_python/bins/linux/kallisto/kallisto_optoff_k64
8.03 MB
diff --git a/kb_python/bins/windows/bustools/bustools.exe
3.18 KB b/kb_python/bins/windows/bustools/bustools.exe
3.18 KB
diff --git a/kb_python/bins/windows/kallisto/kallisto.exe
-1.1 MB b/kb_python/bins/windows/kallisto/kallisto.exe
-1.1 MB
diff --git a/kb_python/bins/windows/kallisto/kallisto_k64.exe
10.4 MB b/kb_python/bins/windows/kallisto/kallisto_k64.exe
10.4 MB
diff --git a/kb_python/bins/windows/kallisto/kallisto_optoff.exe
10.4 MB b/kb_python/bins/windows/kallisto/kallisto_optoff.exe
10.4 MB
diff --git a/kb_python/bins/windows/kallisto/kallisto_optoff_k64.exe
10.4 MB b/kb_python/bins/windows/kallisto/kallisto_optoff_k64.exe
10.4 MB
diff --git a/kb_python/config.py
+17-3 b/kb_python/config.py
+17-3
diff --git a/kb_python/count.py
+83-8 b/kb_python/count.py
+83-8
@@ -22,7 +22,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python: [3.7, 3.8, 3.9 ]
+        python: [3.8, 3.9 ]
         os: [ubuntu-20.04]
     name: Test on Python ${{ matrix.python }}
     steps:
 
@@ -14,7 +14,7 @@ jobs:
       - name: Setup python
         uses: actions/setup-python@v1
         with:
-          python-version: '3.7'
+          python-version: '3.8'
           architecture: x64
       - name: Install dependencies
         run: pip install -r dev-requirements.txt
 
@@ -109,3 +109,9 @@ venv.bak/
 .DS_Store
 .vscode/
 .Rhistory
+
+# PyCharm
+/.idea/
+
+# Temp files
+/scratch/
@@ -2,7 +2,8 @@
 
 test:
 	rm -f .coverage
-	nosetests --verbose --with-coverage --cover-package kb_python tests/* tests/dry/*
+	pytest --verbose --cov=kb_python tests/* tests/dry/* && coverage report && coverage xml
+#	nosetests --verbose --with-coverage --cover-package kb_python tests/* tests/dry/*
 
 check:
 	flake8 kb_python && echo OK
 
@@ -1,5 +1,5 @@
 # kb-python
-![github version](https://img.shields.io/badge/Version-0.28.0-informational)
+![github version](https://img.shields.io/badge/Version-0.29.0-informational)
 [![pypi version](https://img.shields.io/pypi/v/kb-python)](https://pypi.org/project/kb-python/)
 ![python versions](https://img.shields.io/pypi/pyversions/kb_python)
 ![status](https://github.com/pachterlab/kb_python/workflows/CI/badge.svg)
@@ -10,7 +10,7 @@
 
 `kb-python` is a python package for processing single-cell RNA-sequencing. It wraps the [`kallisto` | `bustools`](https://www.kallistobus.tools) single-cell RNA-seq command line tools in order to unify multiple processing workflows. 
 
-`kb-python` was developed by [Kyung Hoi (Joseph) Min](https://twitter.com/lioscro) and [A. Sina Booeshaghi](https://twitter.com/sinabooeshaghi) while in [Lior Pachter](https://twitter.com/lpachter)'s lab at Caltech. If you use `kb-python` in a publication please [cite*](#cite):
+`kb-python` was first developed by [Kyung Hoi (Joseph) Min](https://twitter.com/lioscro) and [A. Sina Booeshaghi](https://twitter.com/sinabooeshaghi) while in [Lior Pachter](https://twitter.com/lpachter)'s lab at Caltech. If you use `kb-python` in a publication please [cite*](#cite):
 ```
 Melsted, P., Booeshaghi, A.S., et al. 
 Modular, efficient and constant-memory single-cell RNA-seq preprocessing. 
@@ -34,7 +34,7 @@ There are no prerequisite packages to install. The `kallisto` and `bustools` bin
 
 ## Usage
 
-`kb`  consists of four subcommands
+`kb`  consists of five subcommands
 ```bash
 $ kb
 usage: kb [-h] [--list] <CMD> ...
@@ -44,6 +44,7 @@ positional arguments:
     compile   Compile `kallisto` and `bustools` binaries from source
     ref       Build a kallisto index and transcript-to-gene mapping
     count     Generate count matrices from a set of single-cell FASTQ files
+    extract   Extract reads that were pseudoaligned to specific genes/transcripts (or extract all reads that were / were not pseudoaligned)
 ```
 
 ### `kb ref`: generate a pseudoalignment index
 
@@ -1,7 +1,8 @@
 bumpversion==0.6.0
-coverage==5.1
+coverage==5.2.1
 flake8==3.8.2
-nose==1.3.7
+pytest==8.2.2
+pytest-cov==5.0.0
 pre-commit==2.4.0
 sphinx>=3.3.1
 sphinx-autoapi>=1.5.1
 
@@ -24,7 +24,7 @@
 author = 'Kyung Hoi (Joseph) Min'
 
 # The full version, including alpha/beta/rc tags
-release = '0.28.2'
+release = '0.29.0'
 master_doc = 'index'
 
 # -- General configuration ---------------------------------------------------
 
@@ -6,7 +6,7 @@
 Welcome to kb-python's documentation!
 =====================================
 
-This page contains **DEVELOPER** documentation for ``kb-python`` version ``0.28.2``.
+This page contains **DEVELOPER** documentation for ``kb-python`` version ``0.29.0``.
 For user documentation and tutorials, please go to `kallisto | bustools <https://www.kallistobus.tools/>`_.
 
 Development Prerequisites
@@ -18,7 +18,7 @@ necessary packages by running::
   pip install -r requirements.txt
   pip install -r dev-requirements.txt
 
-Code qualty and unit tests are strictly enforced for every pull request via
+Code quality and unit tests are strictly enforced for every pull request via
 GitHub Actions.
 
 Code Quality
@@ -33,7 +33,7 @@ at the root of the repository.
 
 Unit-testing
 """"""""""""
-``kb-python`` uses ``nose`` to run unit tests. There is a convenient Makefile
+``kb-python`` uses ``pytest`` to run unit tests. There is a convenient Makefile
 rule in place to run all tests::
 
   make test
 
@@ -1 +1 @@
-__version__ = '0.28.2'
+__version__ = '0.29.0'
@@ -34,7 +34,14 @@ def get_provided_kallisto_path() -> Optional[str]:
     Returns:
         Path to the binary, `None` if not found
     """
-    bin_filename = 'kallisto.exe' if PLATFORM == 'windows' else 'kallisto'
+    bin_name = 'kallisto'
+    if '_KALLISTO_OPTOFF' in globals():
+        if _KALLISTO_OPTOFF:
+            bin_name = f'{bin_name}_optoff'
+    if '_KALLISTO_KMER_64' in globals():
+        if _KALLISTO_KMER_64:
+            bin_name = f'{bin_name}_k64'
+    bin_filename = f'{bin_name}.exe' if PLATFORM == 'windows' else bin_name
     path = os.path.join(BINS_DIR, PLATFORM, CPU, 'kallisto', bin_filename)
     if not os.path.isfile(path):
         return None
@@ -54,11 +61,18 @@ def get_provided_bustools_path() -> Optional[str]:
     return path
 
 
+def set_special_kallisto_binary(k64: bool, optoff: bool):
+    global _KALLISTO_KMER_64
+    global _KALLISTO_OPTOFF
+    _KALLISTO_KMER_64 = k64
+    _KALLISTO_OPTOFF = optoff
+
+
 def get_compiled_kallisto_path(alias: str = COMPILED_DIR) -> Optional[str]:
     """Finds platform-dependent kallisto binary compiled with `compile`.
 
     Args:
-        Alias: Alias of compiled binary.
+        alias: Alias of compiled binary.
 
     Returns:
         Path to the binary, `None` if not found
@@ -74,7 +88,7 @@ def get_compiled_bustools_path(alias: str = COMPILED_DIR) -> Optional[str]:
     """Finds platform-dependent bustools binary compiled with `compile`.
 
     Args:
-        Alias: Alias of compiled binary.
+        alias: Alias of compiled binary.
 
     Returns:
         Path to the binary, `None` if not found
 
@@ -105,6 +105,11 @@ def kallisto_bus(
     demultiplexed: bool = False,
     batch_barcodes: bool = False,
     numreads: int = None,
+    lr: bool = False,
+    lr_thresh: float = 0.8,
+    lr_error_rate: float = None,
+    union: bool = False,
+    no_jump: bool = False,
 ) -> Dict[str, str]:
     """Runs `kallisto bus`.
 
@@ -133,6 +138,11 @@ def kallisto_bus(
         demultiplexed: Whether FASTQs are demultiplexed, defaults to `False`
         batch_barcodes: Whether sample ID should be in barcode, defaults to `False`
         numreads: Maximum number of reads to process from supplied input
+        lr: Whether to use lr-kallisto in read mapping, defaults to `False`
+        lr_thresh: Sets the --threshold for lr-kallisto, defaults to `0.8`
+        lr_error_rate: Sets the --error-rate for lr-kallisto, defaults to `None`
+        union: Use set union for pseudoalignment, defaults to `False`
+        no_jump: Disable pseudoalignment "jumping", defaults to `False`
 
     Returns:
         Dictionary containing paths to generated files
@@ -194,6 +204,16 @@ def kallisto_bus(
         command += ['--rf-stranded']
     if inleaved:
         command += ['--inleaved']
+    if lr:
+        command += ['--long']
+    if lr and lr_thresh:
+        command += ['-r', str(lr_thresh)]
+    if lr and lr_error_rate:
+        command += ['-e', str(lr_error_rate)]
+    if union:
+        command += ['--union']
+    if no_jump:
+        command += ['--no-jump']
     if batch_barcodes:
         command += ['--batch-barcodes']
     if is_batch:
@@ -224,12 +244,14 @@ def kallisto_quant_tcc(
     matrix_to_files: bool = False,
     matrix_to_directories: bool = False,
     no_fragment: bool = False,
+    lr: bool = False,
+    lr_platform: str = 'ONT',
 ) -> Dict[str, str]:
     """Runs `kallisto quant-tcc`.
 
     Args:
         mtx_path: Path to counts matrix
-        saved_index_path: Path to index.saved
+        saved_index_path: Path to index
         ecmap_path: Path to ecmap
         t2g_path: Path to T2G
         out_dir: Output directory path
@@ -241,6 +263,8 @@ def kallisto_quant_tcc(
         matrix_to_files: Whether to write quant-tcc output to files, defaults to `False`
         matrix_to_directories: Whether to write quant-tcc output to directories, defaults to `False`
         no_fragment: Whether to disable quant-tcc effective length normalization, defaults to `False`
+        lr: Whether to use lr-kallisto in quantification, defaults to `False`
+        lr_platform: Sets the --platform for lr-kallisto, defaults to `ONT`
 
     Returns:
         Dictionary containing path to output files
@@ -255,6 +279,10 @@ def kallisto_quant_tcc(
     command += ['-e', ecmap_path]
     command += ['-g', t2g_path]
     command += ['-t', threads]
+    if lr:
+        command += ['--long']
+    if lr and lr_platform:
+        command += ['-P', lr_platform]
     if flens_path and not no_fragment:
         command += ['-f', flens_path]
     if l and not no_fragment:
@@ -1178,6 +1206,14 @@ def count(
     no_fragment: bool = False,
     numreads: int = None,
     store_num: bool = False,
+    lr: bool = False,
+    lr_thresh: float = 0.8,
+    lr_error_rate: float = None,
+    lr_platform: str = 'ONT',
+    union: bool = False,
+    no_jump: bool = False,
+    quant_umis: bool = False,
+    keep_flags: bool = False,
 ) -> Dict[str, Union[str, Dict[str, str]]]:
     """Generates count matrices for single-cell RNA seq.
 
@@ -1242,6 +1278,14 @@ def count(
         no_fragment: Whether to disable quant-tcc effective length normalization, defaults to `False`
         numreads: Maximum number of reads to process from supplied input
         store_num: Whether to store read numbers in BUS file, defaults to `False`
+        lr: Whether to use lr-kallisto in read mapping, defaults to `False`
+        lr_thresh: Sets the --threshold for lr-kallisto, defaults to `0.8`
+        lr_error_rate: Sets the --error-rate for lr-kallisto, defaults to `None`
+        lr_platform: Sets the --platform for lr-kallisto, defaults to `ONT`
+        union: Use set union for pseudoalignment, defaults to `False`
+        no_jump: Disable pseudoalignment "jumping", defaults to `False`
+        quant_umis: Whether to run quant-tcc when there are UMIs, defaults to `False`
+        keep_flags: Preserve flag column when sorting BUS file, defaults to `False`
 
     Returns:
         Dictionary containing paths to generated files
@@ -1292,7 +1336,12 @@ def count(
             demultiplexed=demultiplexed,
             batch_barcodes=batch_barcodes,
             numreads=numreads,
-            n=store_num
+            n=store_num,
+            lr=lr,
+            lr_thresh=lr_thresh,
+            lr_error_rate=lr_error_rate,
+            union=union,
+            no_jump=no_jump
         )
     else:
         logger.info(
@@ -1309,7 +1358,7 @@ def count(
         temp_dir=temp_dir,
         threads=threads,
         memory=memory,
-        store_num=store_num
+        store_num=store_num and not keep_flags
     )
     correct = True
     if whitelist_path and whitelist_path.upper() == "NONE":
@@ -1404,6 +1453,9 @@ def update_results_with_suffix(current_results, new_results, suffix):
         technology.upper() in ('BULK', 'SMARTSEQ2', 'SMARTSEQ3')
     ) or ignore_umis
     quant = cm and tcc
+    if quant_umis:
+        quant = True
+        no_fragment = True
     suffix_to_inspect_filename = {'': ''}
     if (technology.upper() == 'SMARTSEQ3'):
         suffix_to_inspect_filename = {
@@ -1518,6 +1570,8 @@ def update_results_with_suffix(current_results, new_results, suffix):
                     matrix_to_files=matrix_to_files,
                     matrix_to_directories=matrix_to_directories,
                     no_fragment=no_fragment,
+                    lr=lr,
+                    lr_platform=lr_platform,
                 )
                 update_results_with_suffix(
                     unfiltered_results, quant_result, suffix
@@ -1695,6 +1749,14 @@ def count_nac(
     batch_barcodes: bool = False,
     numreads: int = None,
     store_num: bool = False,
+    lr: bool = False,
+    lr_thresh: float = 0.8,
+    lr_error_rate: float = None,
+    lr_platform: str = 'ONT',
+    union: bool = False,
+    no_jump: bool = False,
+    quant_umis: bool = False,
+    keep_flags: bool = False,
 ) -> Dict[str, Union[Dict[str, str], str]]:
     """Generates RNA velocity matrices for single-cell RNA seq.
 
@@ -1756,6 +1818,14 @@ def count_nac(
         batch_barcodes: Whether sample ID should be in barcode, defaults to `False`
         numreads: Maximum number of reads to process from supplied input
         store_num: Whether to store read numbers in BUS file, defaults to `False`
+        lr: Whether to use lr-kallisto in read mapping, defaults to `False`
+        lr_thresh: Sets the --threshold for lr-kallisto, defaults to `0.8`
+        lr_error_rate: Sets the --error-rate for lr-kallisto, defaults to `None`
+        lr_platform: Sets the --platform for lr-kallisto, defaults to `ONT`
+        union: Use set union for pseudoalignment, defaults to `False`
+        no_jump: Disable pseudoalignment "jumping", defaults to `False`
+        quant_umis: Whether to run quant-tcc when there are UMIs, defaults to `False`
+        keep_flags: Preserve flag column when sorting BUS file, defaults to `False`
 
     Returns:
         Dictionary containing path to generated index
@@ -1803,7 +1873,12 @@ def count_nac(
             demultiplexed=demultiplexed,
             batch_barcodes=batch_barcodes,
             numreads=numreads,
-            n=store_num
+            n=store_num,
+            lr=lr,
+            lr_thresh=lr_thresh,
+            lr_error_rate=lr_error_rate,
+            union=union,
+            no_jump=no_jump
         )
     else:
         logger.info(
@@ -1820,7 +1895,7 @@ def count_nac(
         temp_dir=temp_dir,
         threads=threads,
         memory=memory,
-        store_num=store_num
+        store_num=store_num and not keep_flags
     )
     correct = True
     if whitelist_path and whitelist_path.upper() == "NONE":
@@ -2073,8 +2148,8 @@ def update_results_with_suffix(current_results, new_results, suffix):
                         if batch_barcodes else None for prefix in prefixes
                     ],
                     genes_paths=[
-                        unfiltered_results[prefix][f'txnames{suffix}'] if tcc
-                        else unfiltered_results[prefix].get(f'genes{suffix}')
+                        unfiltered_results[prefix][f'ec{suffix}'] if tcc else
+                        unfiltered_results[prefix].get(f'genes{suffix}')
                         for prefix in prefixes
                     ],
                     t2g_path=t2g_path,
@@ -2975,7 +3050,7 @@ def update_results_with_suffix(current_results, new_results, suffix):
                     for prefix in prefixes
                 ],
                 genes_paths=[
-                    unfiltered_results[prefix][f'txnames{suffix}'] if tcc else
+                    unfiltered_results[prefix][f'ec{suffix}'] if tcc else
                     unfiltered_results[prefix].get(f'genes{suffix}')
                     for prefix in prefixes
                 ],
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = '0.28.2'`
	`1`	`+__version__ = '0.29.0'`