Skip to content

Commit 5b0acf8

Browse files
committed
feat: Cell Ranger 8.0.0
1 parent 7c9ccea commit 5b0acf8

File tree

423 files changed

+21747
-17482
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

423 files changed

+21747
-17482
lines changed

bin/tenkit/sitecheck

+3
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,9 @@ ifchk "SGE JOB_NAME" "echo \$JOB_NAME"
104104
check "LSF Submit" "which bsub"
105105
ifchk "LSF LSB_JOBNAME" "echo \$LSB_JOBNAME"
106106

107+
check "HTCondor Submit" "which condor_submit"
108+
check "Batch system" "echo \$BATCH_SYSTEM"
109+
107110
check "BCL2FASTQ 1" "which configureBclToFastq.pl"
108111
ifchk "BCL2FASTQ 1 Version" "ls \$(dirname \$(which configureBclToFastq.pl))/../etc"
109112
ifchk "Perl" "which perl"

conda_spec.bzl

+713-289
Large diffs are not rendered by default.

lib/bin/parameters.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
detect_chemistry_sample_reads = 100_000
2-
detect_chemistry_total_reads = 1_000_000
2+
detect_chemistry_total_reads = 2_000_000
33
min_fraction_whitelist_match = 0.1
44
min_barcode_similarity = 0.1
55
star_parameters = ""

lib/python/cellranger/altair_utils.py

+43-11
Original file line numberDiff line numberDiff line change
@@ -6,45 +6,77 @@
66

77
import altair as alt
88
import pandas as pd
9+
from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype
910

1011
LAYER = "layer"
1112
HCONCAT = "hconcat"
1213
VCONCAT = "vconcat"
1314
DATA = "data"
1415
DATA_NAME = "name"
1516
DATA_FORMAT = "format"
17+
DATASETS_KEY = "datasets"
18+
CSV_PARSE_KEY = "parse"
1619
DATA_FORMAT_CSV_DICT = {"type": "csv"}
1720

1821

19-
def sanitise_data_names(chart_dict):
20-
"""Sanitise data names in chart dict."""
22+
def get_vega_dtype(dtype: pd.DataFrame.dtypes):
23+
"""Convert pandas dtype to vega-lite dtype.
24+
25+
Vega-lite CSV parse
26+
dtypes are - https://vega.github.io/vega-lite-api/api/csv.html#parse .
27+
"""
28+
if is_bool_dtype(dtype):
29+
return "boolean"
30+
elif is_datetime64_any_dtype(dtype):
31+
return "date"
32+
elif is_numeric_dtype(dtype): # A little tricky as bools are also considered numeric.
33+
return "number"
34+
else:
35+
raise ValueError("Expected code to be unreachable")
36+
37+
38+
def sanitise_data_names(chart_dict, name_to_format_dict):
39+
"""Sanitise data names in chart dict.
40+
41+
Uses name_to_format_dict to get column types to parse.
42+
"""
2143
if LAYER in chart_dict:
2244
for layer_dict in chart_dict[LAYER]:
23-
sanitise_data_names(layer_dict)
45+
sanitise_data_names(layer_dict, name_to_format_dict)
2446
if HCONCAT in chart_dict:
2547
for hconcat_dict in chart_dict[HCONCAT]:
26-
sanitise_data_names(hconcat_dict)
48+
sanitise_data_names(hconcat_dict, name_to_format_dict)
2749
if VCONCAT in chart_dict:
2850
for vconcat_dict in chart_dict[VCONCAT]:
29-
sanitise_data_names(vconcat_dict)
51+
sanitise_data_names(vconcat_dict, name_to_format_dict)
3052
if DATA in chart_dict:
3153
if DATA_NAME not in chart_dict[DATA]:
3254
raise ValueError("Found nameless data in Altair JSON. Not expected VEGA out.")
3355
else:
56+
old_data_name = chart_dict[DATA][DATA_NAME]
57+
chart_dict[DATA][DATA_FORMAT] = name_to_format_dict[old_data_name]
3458
chart_dict[DATA][DATA_NAME] += ".csv"
35-
chart_dict[DATA][DATA_FORMAT] = DATA_FORMAT_CSV_DICT
3659

3760

3861
def sanitise_chart_dict(chart_dict):
3962
"""Clean up the chart dict."""
4063
# Convert all datasets into CSVs from JSONs
41-
raw_dataset_names = list(chart_dict["datasets"].keys())
64+
raw_dataset_names = list(chart_dict[DATASETS_KEY].keys())
65+
name_to_format_dict = {}
4266
for raw_dataset in raw_dataset_names:
43-
csv_string = pd.DataFrame.from_dict(chart_dict["datasets"][raw_dataset]).to_csv(index=False)
67+
name_to_format_dict[raw_dataset] = DATA_FORMAT_CSV_DICT.copy()
68+
df = pd.DataFrame.from_dict(chart_dict[DATASETS_KEY][raw_dataset])
69+
csv_string = df.to_csv(index=False)
70+
# Getting columns with special parse
71+
name_to_format_dict[raw_dataset][CSV_PARSE_KEY] = {
72+
x: get_vega_dtype(y)
73+
for x, y in df.dtypes.items()
74+
if (is_numeric_dtype(y) or is_datetime64_any_dtype(y) or is_bool_dtype(y))
75+
}
4476
new_dataset_name = f"{raw_dataset}.csv"
45-
chart_dict["datasets"][new_dataset_name] = csv_string
46-
chart_dict["datasets"].pop(raw_dataset)
47-
sanitise_data_names(chart_dict)
77+
chart_dict[DATASETS_KEY][new_dataset_name] = csv_string
78+
chart_dict[DATASETS_KEY].pop(raw_dataset)
79+
sanitise_data_names(chart_dict, name_to_format_dict)
4880

4981

5082
def chart_to_json(

lib/python/cellranger/analysis/combinatorics.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def generate_all_multiplets(n_tags: int, max_multiplets: int, add_unit_vector_at
5757
x_1 + x_2 + x_3 + ... = k for k = 1..max_multiplets
5858
"""
5959
solutions: list[list[int]] = []
60-
for k in range(0, max_multiplets + 1):
60+
for k in range(max_multiplets + 1):
6161
cur_solutions = list(generate_solutions(n_tags, k))
6262
assert int(calc_all_possible_nonnegative_solutions(n_tags, k)) == len(cur_solutions)
6363
solutions += cur_solutions

lib/python/cellranger/analysis/irlb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def irlb(
219219
V[:, k] = F
220220
B = np.zeros((m_b, m_b))
221221
# Improve this! There must be better way to assign diagonal...
222-
for l in range(0, k):
222+
for l in range(k):
223223
B[l, l] = S[1][l]
224224
B[0:k, k] = R[0:k]
225225
# Update the left approximate singular vectors

lib/python/cellranger/analysis/multigenome.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,15 @@ def classify_gems(
154154
counts1[counts1 > counts0], analysis_constants.MULTIPLET_PROB_THRESHOLD * 100.0
155155
)
156156

157+
# If input is a pure species instead of a mixture then modeling counts independently
158+
# can result in an absurdly low threshold for the missing species causing FP labels that
159+
# show up as an inflated number of Multiplets.
160+
thresholds = sorted([thresh0, thresh1])
161+
fold_change = thresholds[1] / thresholds[0]
162+
if (thresholds[0] < 50) and (fold_change > 25):
163+
thresh0 = thresh1 = np.percentile(
164+
counts0 + counts1, analysis_constants.MULTIPLET_PROB_THRESHOLD * 100.0
165+
)
157166
doublet = np.logical_and(counts0 >= thresh0, counts1 >= thresh1)
158167
dtype = np.dtype(("S", max(len(cls) for cls in analysis_constants.GEM_CLASSES)))
159168
result = np.where(
@@ -226,7 +235,7 @@ def _infer_multiplets(
226235
np.random.seed(0)
227236

228237
n_multiplet_boot: np.ndarray[int, np.dtype[np.float_]] = np.zeros(bootstraps)
229-
for i in range(0, bootstraps):
238+
for i in range(bootstraps):
230239
boot_idx = np.random.choice(len(counts0), len(counts0))
231240
counts0_boot = counts0[boot_idx]
232241
counts1_boot = counts1[boot_idx]

lib/python/cellranger/cell_calling.py

+34-16
Original file line numberDiff line numberDiff line change
@@ -15,23 +15,28 @@
1515
import cellranger.sgt as cr_sgt
1616
import cellranger.stats as cr_stats
1717
from cellranger.analysis.diffexp import adjust_pvalue_bh
18-
from cellranger.chemistry import CHEMISTRY_DESCRIPTION_FIELD, CHEMISTRY_SC3P_LT
18+
from cellranger.chemistry import (
19+
CHEMISTRY_DESCRIPTION_FIELD,
20+
CHEMISTRY_SC3P_LT,
21+
SC3P_V4_CHEMISTRIES,
22+
SC5P_V3_CHEMISTRIES,
23+
)
1924

2025
# Drop this top fraction of the barcodes when estimating ambient.
2126
MAX_OCCUPIED_PARTITIONS_FRAC = 0.5
2227

28+
# Minimum number of UMIs for a barcode to be called as a cell
29+
MIN_GLOBAL_UMIS = 0
30+
31+
# Maximum percentage of mitochondrial reads allowed for a barcode to be called a cell
32+
MAX_MITO_PCT = 100.0
33+
2334
# Minimum number of UMIS per barcode to consider after the initial cell calling
2435
MIN_UMIS = 500
2536

2637
# Default number of background simulations to make
2738
NUM_SIMS = 10000
2839

29-
# Minimum ratio of UMIs to the median (initial cell call UMI) to consider after the initial cell calling
30-
MIN_UMI_FRAC_OF_MEDIAN = 0.01
31-
32-
# Maximum adjusted p-value to call a barcode as non-ambient
33-
MAX_ADJ_PVALUE = 0.01
34-
3540
# Minimum number of UMIS per barcode to consider after the initial cell calling for targeted GEX
3641
TARGETED_CC_MIN_UMIS_ADDITIONAL_CELLS = 10
3742

@@ -103,6 +108,15 @@ class NonAmbientBarcodeResult(NamedTuple):
103108
pvalues: np.ndarray # pvalues (n)
104109
pvalues_adj: np.ndarray # B-H adjusted pvalues (n)
105110
is_nonambient: np.ndarray # Boolean nonambient calls (n)
111+
emptydrops_minimum_umis: int # Min UMI threshold for empty drops (1)
112+
113+
114+
def get_empty_drops_fdr(chemistry_description: str) -> float:
115+
"""Gets the maximum adjusted p-value to call a barcode as non-ambient."""
116+
# The chips used with V4 have roughly double the GEMs as the older V3 chips
117+
v4_chemistries = SC3P_V4_CHEMISTRIES + SC5P_V3_CHEMISTRIES
118+
v4_chem_names = [chem[CHEMISTRY_DESCRIPTION_FIELD] for chem in v4_chemistries]
119+
return 0.001 if chemistry_description in v4_chem_names else 0.01
106120

107121

108122
def get_empty_drops_range(chemistry_description: str, num_probe_bcs: int | None) -> tuple[int, int]:
@@ -115,8 +129,13 @@ def get_empty_drops_range(chemistry_description: str, num_probe_bcs: int | None)
115129
low_index:
116130
high_index:
117131
"""
132+
# The chips used with V4 have roughly double the GEMs as the older V3 chips
133+
v4_chemistries = SC3P_V4_CHEMISTRIES + SC5P_V3_CHEMISTRIES
134+
v4_chem_names = [chem[CHEMISTRY_DESCRIPTION_FIELD] for chem in v4_chemistries]
118135
if chemistry_description == CHEMISTRY_SC3P_LT[CHEMISTRY_DESCRIPTION_FIELD]:
119136
N_PARTITIONS = 9000
137+
elif chemistry_description in v4_chem_names:
138+
N_PARTITIONS = 80000 * num_probe_bcs if num_probe_bcs and num_probe_bcs > 1 else 160000
120139
else:
121140
N_PARTITIONS = 45000 * num_probe_bcs if num_probe_bcs and num_probe_bcs > 1 else 90000
122141
return (N_PARTITIONS // 2, N_PARTITIONS)
@@ -128,9 +147,7 @@ def find_nonambient_barcodes(
128147
chemistry_description,
129148
num_probe_bcs,
130149
*,
131-
min_umi_frac_of_median=MIN_UMI_FRAC_OF_MEDIAN,
132150
emptydrops_minimum_umis=MIN_UMIS,
133-
max_adj_pvalue=MAX_ADJ_PVALUE,
134151
num_sims=NUM_SIMS,
135152
):
136153
"""Call barcodes as being sufficiently distinct from the ambient profile.
@@ -148,6 +165,7 @@ def find_nonambient_barcodes(
148165
bc_order = np.argsort(umis_per_bc)
149166

150167
low, high = get_empty_drops_range(chemistry_description, num_probe_bcs)
168+
max_adj_pvalue = get_empty_drops_fdr(chemistry_description)
151169

152170
# Take what we expect to be the barcodes associated w/ empty partitions.
153171
print(f"Range empty barcodes: {low} - {high}")
@@ -186,14 +204,11 @@ def find_nonambient_barcodes(
186204
eval_bcs = np.ma.array(np.arange(matrix.bcs_dim))
187205
eval_bcs[orig_cells] = ma.masked
188206

189-
median_initial_umis = np.median(umis_per_bc[orig_cells])
190-
min_umis = int(
191-
max(emptydrops_minimum_umis, np.ceil(median_initial_umis * min_umi_frac_of_median))
192-
)
193-
print(f"Median UMIs of initial cell calls: {median_initial_umis}")
194-
print(f"Min UMIs: {min_umis}")
207+
max_background_umis = np.max(umis_per_bc[empty_bcs], initial=0)
208+
emptydrops_minimum_umis = max(emptydrops_minimum_umis, 1 + max_background_umis)
209+
print(f"Max background UMIs: {max_background_umis}")
195210

196-
eval_bcs[umis_per_bc < min_umis] = ma.masked
211+
eval_bcs[umis_per_bc < emptydrops_minimum_umis] = ma.masked
197212
n_unmasked_bcs = len(eval_bcs) - eval_bcs.mask.sum()
198213

199214
eval_bcs = np.argsort(ma.masked_array(umis_per_bc, mask=eval_bcs.mask))[0:n_unmasked_bcs]
@@ -208,6 +223,7 @@ def find_nonambient_barcodes(
208223
return None
209224

210225
assert not np.any(np.isin(eval_bcs, orig_cells))
226+
assert not np.any(np.isin(eval_bcs, empty_bcs))
211227
print(f"Number of candidate bcs: {len(eval_bcs)}")
212228
print(f"Range candidate bc umis: {umis_per_bc[eval_bcs].min()}, {umis_per_bc[eval_bcs].max()}")
213229

@@ -234,6 +250,7 @@ def find_nonambient_barcodes(
234250

235251
pvalues_adj = adjust_pvalue_bh(pvalues)
236252

253+
print(f"Max adjusted P-value: {max_adj_pvalue}")
237254
is_nonambient = pvalues_adj <= max_adj_pvalue
238255

239256
return NonAmbientBarcodeResult(
@@ -242,4 +259,5 @@ def find_nonambient_barcodes(
242259
pvalues=pvalues,
243260
pvalues_adj=pvalues_adj,
244261
is_nonambient=is_nonambient,
262+
emptydrops_minimum_umis=emptydrops_minimum_umis,
245263
)

0 commit comments

Comments
 (0)