Skip to content

Commit 6229bcf

Browse files
authored
Merge pull request #28 from CAnBioNet/nested-subsampling
Refactor subsample creation and implement nested subsampling
2 parents 9c21dc6 + 5268084 commit 6229bcf

File tree

1 file changed

+103
-26
lines changed

1 file changed

+103
-26
lines changed

reconstruction/create_subsamples.py

Lines changed: 103 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#!/usr/bin/env python
22
import argparse
3+
import itertools
34
import json
45
import numpy
56
from pathlib import Path
@@ -15,8 +16,9 @@ def getArgs():
1516
requiredArgGroup.add_argument("--subsample-file", type=str, dest="subsampleFile", required=True, help="File to write the subsample descriptions to")
1617
requiredArgGroup.add_argument("-p", "--proportion", type=float, required=True, help="Proportion of samples to put in each subsample. Must satisfy 0 < p < 1.")
1718
requiredArgGroup.add_argument("-n", "--nsubsamples", type=int, required=True, help="Number of subsamples to generate.")
18-
optionalArgGroup.add_argument("-s", "--singlecell", action="store_true", default=False, help="Create subsamples for single-cell data instead of aggregate data")
19+
optionalArgGroup.add_argument("-s", "--singlecell", action="store_true", default=False, help="Create subsamples for single-cell data instead of aggregate data. By default, this will subsample cells rather than organisms. To override this behavior, use the -o/--organism option to subsample organisms or the -b/--both-organisms-and-cells option to subsample both.")
1920
optionalArgGroup.add_argument("-o", "--organism", action="store_true", default=False, help="Subsample over organisms, even for single-cell data (this is the default behavior for aggregate data)")
21+
optionalArgGroup.add_argument("-b", "--both-organisms-and-cells", dest="bothOrganismsAndCells", nargs=1, metavar="ORGANISM_PROPORTION", help="Subsample over organisms and cells (only applicable to single-cell, -s). The additional argument is the proportion of organisms in each treatment to use for each subsample and must satisfy 0 < p < 1.")
2022
optionalArgGroup.add_argument("-h", "--help", action="help", help="Show this help message and exit")
2123
args = parser.parse_args()
2224

@@ -29,6 +31,24 @@ def getArgs():
2931
if args.proportion <= 0 or args.proportion >= 1:
3032
raise Exception("Proportion of samples out of bounds. Must satisfy 0 < p < 1.")
3133

34+
bothOrganismsAndCells = args.bothOrganismsAndCells is not None
35+
36+
if args.singlecell:
37+
if args.organism and bothOrganismsAndCells:
38+
raise Exception("Cannot use the -o/--organism and -b/--both-organisms-and-cells options simultaneously.")
39+
else:
40+
if args.organism:
41+
raise Exception("Can only use -o/--organism with single-cell data (-s/--singlecell)")
42+
if bothOrganismsAndCells:
43+
raise Exception("Can only use -b/--both-organisms-and-single-cells with single-cell data (-s/--singlecell).")
44+
45+
if bothOrganismsAndCells:
46+
args.organismProportion = float(args.bothOrganismsAndCells[0])
47+
if args.organismProportion <= 0 or args.organismProportion >= 1:
48+
raise Exception("Proportion of organisms out of bounds. Must satisfy 0 < p < 1.")
49+
50+
args.bothOrganismsAndCells = bothOrganismsAndCells
51+
3252
return args
3353

3454
if __name__ == "__main__":
@@ -40,37 +60,94 @@ def getArgs():
4060
subsamples = []
4161
if args.singlecell:
4262
data = dataset.get_table("cellData")
43-
if args.organism:
44-
organisms = numpy.array(list(set(data["organism"].data)))
45-
nAllSamples = len(organisms)
46-
nSamples = round(args.proportion * nAllSamples)
63+
64+
# Construct maps from dataset coordinates
65+
organismCellMap = {} # organism -> list of indices of cells belonging to the organism
66+
experimentMap = {} # experiment -> list of organisms in experiment
67+
treatmentMap = {} # treatment -> list of organisms in treatment group
68+
cellTypeMap = {} # cell type -> list of indices of cells of that type
69+
for index, cellData in enumerate(data.cell):
70+
organism = cellData.organism.item()
71+
if organism not in organismCellMap:
72+
organismCellMap[organism] = []
73+
74+
experiment = cellData.experiment.item()
75+
if experiment not in experimentMap:
76+
experimentMap[experiment] = []
77+
experimentMap[experiment].append(organism)
78+
79+
treatment = cellData.treatment.item()
80+
if treatment not in treatmentMap:
81+
treatmentMap[treatment] = []
82+
treatmentMap[treatment].append(organism)
83+
84+
cellType = cellData.cellType.item()
85+
if cellType not in cellTypeMap:
86+
cellTypeMap[cellType] = []
87+
cellTypeMap[cellType].append(index)
88+
organismCellMap[organism].append(index)
89+
90+
def subsampleCells(organisms):
91+
subsample = []
92+
for organism in organisms:
93+
organismCellIndices = set(organismCellMap[organism])
94+
for cellType in cellTypeMap:
95+
cellTypeCellIndices = set(cellTypeMap[cellType])
96+
matchingCellIndices = organismCellIndices.intersection(cellTypeCellIndices)
97+
subsetSubsampleSize = round(args.proportion * len(matchingCellIndices))
98+
subsetSubsample = random.sample(matchingCellIndices, subsetSubsampleSize)
99+
subsample.extend(subsetSubsample)
100+
subsample.sort()
101+
return subsample
102+
103+
if args.organism or args.bothOrganismsAndCells:
104+
# Take a sample of the organisms from each treatment
105+
organismProportion = args.proportion if args.organism else args.organismProportion
47106
for i in range(args.nsubsamples):
48-
organismSubsample = random.sample(range(nAllSamples), nSamples)
49-
cellSubsample = []
50-
for organismIndex in organismSubsample:
51-
organism = organisms[organismIndex]
52-
organismCellIndices = numpy.argwhere((data.organism == organism).data).flatten().tolist()
53-
cellSubsample.extend(organismCellIndices)
54-
cellSubsample.sort()
55-
subsamples.append(cellSubsample)
107+
organismSubsample = []
108+
for experimentOrganisms in experimentMap.values():
109+
for treatmentOrganisms in treatmentMap.values():
110+
matchingOrganisms = set(experimentOrganisms).intersection(set(treatmentOrganisms))
111+
subsetSubsampleSize = round(organismProportion * len(matchingOrganisms))
112+
subsetSubsample = random.sample(matchingOrganisms, subsetSubsampleSize)
113+
organismSubsample.extend(subsetSubsample)
114+
if args.organism:
115+
subsample = list(itertools.chain(*[organismCellMap[organism] for organism in organismSubsample]))
116+
subsample.sort()
117+
subsamples.append(subsample)
118+
elif args.bothOrganismsAndCells:
119+
# Now that we have a sample of the organisms, take a sample of each selected organism's cells within each cell type
120+
subsamples.append(subsampleCells(organismSubsample))
121+
else:
122+
raise Exception("Argument state invalid.")
56123
else:
57124
for i in range(args.nsubsamples):
58-
coordSubsample = []
59-
def subsamplePopulation(data):
60-
nAllSamples = data.sizes["cell"]
61-
nSamples = round(args.proportion * nAllSamples)
62-
populationCoordSubsample = random.sample(list(data.coords["cell"].data), nSamples)
63-
coordSubsample.extend(populationCoordSubsample)
64-
return data
65-
data.groupby("organism").map(lambda individualData: individualData.groupby("cellType").map(subsamplePopulation))
66-
indexSubsample = list(numpy.transpose(numpy.argwhere(numpy.isin(data.cell, coordSubsample))).astype(object)[0])
67-
subsamples.append(indexSubsample)
125+
subsamples.append(subsampleCells(organismCellMap.keys()))
68126
else:
69127
data = dataset.get_table("originalData")
70-
nAllSamples = data.sizes["organism"]
71-
nSamples = round(args.proportion * nAllSamples)
128+
129+
# Construct maps from dataset coordinates
130+
experimentMap = {} # experiment -> list of indices of organisms in experiment
131+
treatmentMap = {} # treatment -> list of indices of organisms in treatment group
132+
for index, organism in enumerate(data.organism):
133+
experiment = organism.experiment.item()
134+
if experiment not in experimentMap:
135+
experimentMap[experiment] = []
136+
experimentMap[experiment].append(organism)
137+
138+
treatment = organism.treatment.item()
139+
if treatment not in treatmentMap:
140+
treatmentMap[treatment] = []
141+
treatmentMap[treatment].append(index)
142+
72143
for i in range(args.nsubsamples):
73-
subsample = random.sample(range(nAllSamples), nSamples)
144+
subsample = []
145+
for experimentOrganismIndices in experimentMap.values():
146+
for treatmentOrganismIndices in treatmentMap.values():
147+
matchingOrganismIndices = set(experimentOrganismIndices).intersection(set(treatmentOrganismIndices))
148+
subsetSubsampleSize = round(args.proportion * len(subsetOrganismIndices))
149+
subsetSubsample = random.sample(matchingOrganismIndices, subsetSubsampleSize)
150+
subsample.extend(subsetSubsample)
74151
subsample.sort()
75152
subsamples.append(subsample)
76153

0 commit comments

Comments
 (0)