ADD symmetric tomtom
jmschrei committed Oct 31, 2024
1 parent 691d77a commit 6c1da9f
Showing 8 changed files with 622 additions and 136 deletions.
@@ -81,7 +81,7 @@
"\n",
"Marginalization is the process of substituting in a motif and observing the change in model output before and after the substitution. Usually, this substitution is done into a background region so that one can estimate the \"marginal\" effect that each motif has on model predictions in isolation, and this value is averaged over many background sequencces.\n",
"\n",
"An important complication in using marginalizations is the choice of background. <b><i>Uniformly generated random background sequences are likely much higher in GC content than the genome these models were trained on</i></b> and so they may not be the best choice, although they are very convenient to use. The best choice is usually to use regions of the genome that are known to not exhibit the activity you care about but have similar GC content, e.g., regions that are not peaks, regions that are not accessible, non-gene body non-promoter regions, etc. The second best choice is to use randomly generated sequences with proportions of each character derived from the genome. As a reference: for `chr1` after excluding N's these proportions are `[0.2910, 0.2085, 0.2087, 0.2918]`. When prototyping, it is okay to use uniformly randomly generated sequences, but make sure to switch before presenting your results. <i><b>Models can be very sensitive to local GC content and so when you substitute a motif with a very different content you may get a \"mirage\" that is driven entirely by this local change in GC content.</i></b>\n",
"An important complication in using marginalizations is the choice of background. <b><i>Uniformly generated random background sequences are likely much higher in GC content than the genome these models were trained on</i></b> and so they may not be the best choice, although they are very convenient to use. The best choice is usually to use regions of the genome that are known to not exhibit the activity you care about but have similar GC content and dinucleotide content, e.g., regions that are not peaks, regions that are not accessible, non-gene body non-promoter regions, etc. The second best choice is to use randomly generated sequences with proportions of each character derived from the genome. As a reference: for `chr1` after excluding N's these proportions are `[0.2910, 0.2085, 0.2087, 0.2918]`. When prototyping, it is okay to use uniformly randomly generated sequences, but make sure to switch before presenting your results. <i><b>Models can be very sensitive to local GC content and can also be sensitive to dinucleotide content and so when you substitute a motif with a very different content you may get a \"mirage\" that is driven entirely by this local change in GC content.</i></b>\n",
"\n",
"In general, more background sequences is better as long as you have the compute for it but this betterness drops off quickly after around 100 sequences. "
]
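As a minimal sketch of the second-best choice described in this cell, here is one way to generate background sequences with the chr1-derived proportions; plain numpy, the (n, 4, length) one-hot layout, and all names below are illustrative assumptions, not part of this commit:

    import numpy

    # chr1 proportions of A, C, G, T after excluding N's, taken from the cell above
    probs = [0.2910, 0.2085, 0.2087, 0.2918]

    # sample 100 background sequences of length 2,000 with genome-derived frequencies
    rng = numpy.random.RandomState(0)
    idx = rng.choice(4, size=(100, 2000), p=probs)

    # one-hot encode into a (n_sequences, 4, length) array for marginalization
    X_bg = numpy.eye(4, dtype='float32')[idx].transpose(0, 2, 1)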
2 changes: 1 addition & 1 deletion tangermeme/plot.py
@@ -153,7 +153,7 @@ def plot_logo(X_attr, ax, color=None, annotations=None, start=None, end=None,
elif show_extra:
s = motif_start

-            motifs[motif_start:motif_start+len(motif)*2, i] = 1
+            motifs[motif_start:motif_start+len(str(motif))*2, i] = 1
y_offset += -0.1 + 0.2*(n_tracks) + 0.1*(i-n_tracks)

plt.text(motif_start, -ylim*(y_offset+0.1), motif,
11 changes: 6 additions & 5 deletions tangermeme/seqlet.py
@@ -73,7 +73,7 @@ def _laplacian_null(X_sum, num_to_samp=10000, random_state=1234):

prob_pos = len(pos_values) / len(values)
urand = torch.from_numpy(numpy.random.RandomState(random_state).uniform(
-        size=(num_to_samp, 2))).to(X_sum.device)
+        size=(num_to_samp, 2)))

icdf = numpy.log(1 - urand[:, 1])

@@ -210,7 +210,7 @@ def _isotonic_thresholds(values, null_values, increasing, target_fdr,

def tfmodisco_seqlets(X_attr, window_size=21, flank=10, target_fdr=0.2,
min_passing_frac=0.03, max_passing_frac=0.2,
-        weak_threshold_for_counting_sign=0.8, device='cuda'):
+        weak_threshold_for_counting_sign=0.8):
"""Extract seqlets using the procedure from TF-MoDISco.
Seqlets are contiguous spans of high attribution characters. This method
@@ -280,7 +280,7 @@ def tfmodisco_seqlets(X_attr, window_size=21, flank=10, target_fdr=0.2,
values = X_sum.flatten()
if len(values) > 1000000:
values = torch.from_numpy(numpy.random.RandomState(1234).choice(
-            a=values, size=1000000, replace=False)).to(device)
+            a=values, size=1000000, replace=False))

pos_values = values[values >= 0]
neg_values = values[values < 0]
@@ -315,8 +315,9 @@ def tfmodisco_seqlets(X_attr, window_size=21, flank=10, target_fdr=0.2,
X_sum[idxs] = numpy.abs(X_sum[idxs])
X_sum[~idxs] = -numpy.inf

-    X_sum[:, :flank] = -numpy.inf
-    X_sum[:, -flank:] = -numpy.inf
+    if flank > 0:
+        X_sum[:, :flank] = -numpy.inf
+        X_sum[:, -flank:] = -numpy.inf

seqlets = _iterative_extract_seqlets(X_sum=X_sum, window_size=window_size,
flank=flank, suppress=suppress)
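With the device argument removed above, seqlet calling now runs wherever the input tensor lives. A minimal usage sketch, assuming the (n, 4, length) attribution layout used elsewhere in tangermeme; the random attributions are purely illustrative:

    import torch
    from tangermeme.seqlet import tfmodisco_seqlets

    # illustrative attributions for 10 sequences of length 2,000; real values
    # would come from an attribution method such as deep_lift_shap
    X_attr = torch.randn(10, 4, 2000)

    # no device argument anymore; computation follows the input tensor
    seqlets = tfmodisco_seqlets(X_attr, window_size=21, flank=10)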
178 changes: 100 additions & 78 deletions tangermeme/tools/fimo.py
@@ -13,10 +13,7 @@

from tqdm import tqdm


-LOG_2 = math.log(2)
-
-@numba.njit('float64(float64, float64)', cache=True)
+@numba.njit('float64(float64, float64)', fastmath=True, cache=True)
def logaddexp2(x, y):
"""Calculate the logaddexp in a numerically stable manner in base 2.
@@ -41,36 +38,17 @@ def logaddexp2(x, y):
The result of log2(pow(2, x) + pow(2, y))
"""

-    vmax, vmin = max(x, y), min(x, y)
-    return vmax + math.log(math.pow(2, vmin - vmax) + 1) / LOG_2
-
-
-@numba.njit('void(int32[:, :], float64[:], float64[:], int32, int32, float64)',
-    cache=True)
-def _fast_pwm_to_cdf(int_log_pwm, old_logpdf, logpdf, alphabet_length,
-    seq_length, log_bg):
-    """A fast internal function for running the dynamic programming algorithm.
-
-    This function is written in numba to speed up the dynamic programming used
-    to convert score bins into log p-values. This is not meant to be used
-    externally.
-    """
-
-    for i in range(1, seq_length):
-        logpdf[:] = -numpy.inf
-
-        for j, x in enumerate(old_logpdf):
-            if x != -numpy.inf:
-                for k in range(alphabet_length):
-                    offset = int_log_pwm[i, k]
-                    v1 = logpdf[j + offset]
-                    v2 = log_bg + x
-                    logpdf[j + offset] = logaddexp2(v1, v2)
-
-        old_logpdf[:] = logpdf
+    if x == float("-inf") and y == float("-inf"):
+        return float("-inf")
+
+    if x == float("inf") or y == float("inf"):
+        return float("inf")
+
+    vmax, vmin = max(x, y), min(x, y)
+    return vmax + math.log2(math.pow(2, vmin - vmax) + 1)


@numba.njit(cache=True)
def _pwm_to_mapping(log_pwm, bin_size):
"""An internal method for calculating score <-> log p-value mappings.
@@ -108,33 +86,72 @@ def _pwm_to_mapping(log_pwm, bin_size):
associated with each score bin.
"""

+    n, l = log_pwm.shape
+
    log_bg = math.log2(0.25)
-    int_log_pwm = numpy.round(log_pwm / bin_size).astype(numpy.int32).T.copy()
+    int_log_pwm = numpy.round(log_pwm / bin_size).astype(numpy.int32)

-    smallest = int(numpy.min(numpy.cumsum(numpy.min(int_log_pwm, axis=-1),
-        axis=-1)))
-    largest = int(numpy.max(numpy.cumsum(numpy.max(int_log_pwm, axis=-1),
-        axis=-1))) + log_pwm.shape[1]
-
-    logpdf = -numpy.inf * numpy.ones(largest - smallest + 1)
-    for i in range(log_pwm.shape[0]):
-        idx = int_log_pwm[0, i] - smallest
-        logpdf[idx] = numpy.logaddexp2(logpdf[idx], log_bg)
-
-    old_logpdf = logpdf.copy()
-    logpdf[:] = 0
-
-    _fast_pwm_to_cdf(int_log_pwm, old_logpdf, logpdf, log_pwm.shape[0],
-        log_pwm.shape[1], log_bg)
-
-    log1mcdf = logpdf.copy()
+    smallest, largest = 9999999, -9999999
+    log_pwm_min_csum, log_pwm_max_csum = 0, 0
+    for i in range(l):
+        log_pwm_min = 9999999
+        log_pwm_max = -9999999
+
+        for j in range(n):
+            log_pwm_min = min(log_pwm_min, int_log_pwm[j, i])
+            log_pwm_max = max(log_pwm_max, int_log_pwm[j, i])
+
+        log_pwm_min_csum += log_pwm_min
+        log_pwm_max_csum += log_pwm_max
+
+        smallest = min(smallest, log_pwm_min_csum)
+        largest = max(largest, log_pwm_max_csum)
+
+    largest += l
+
+    logpdf = numpy.empty(largest - smallest + 1)
+    old_logpdf = -numpy.inf * numpy.ones(largest - smallest + 1)
+    for i in range(n):
+        idx = int_log_pwm[i, 0] - smallest
+        old_logpdf[idx] = logaddexp2(old_logpdf[idx], log_bg)
+
+    for i in range(1, l):
+        for j in range(largest - smallest + 1):
+            logpdf[j] = -numpy.inf
+
+        for j, x in enumerate(old_logpdf):
+            if x != -numpy.inf:
+                for k in range(n):
+                    idx = j + int_log_pwm[k, i]
+                    logpdf[idx] = logaddexp2(logpdf[idx], log_bg + x)
+
+        for j in range(largest - smallest + 1):
+            old_logpdf[j] = logpdf[j]

    for i in range(len(logpdf) - 2, -1, -1):
-        log1mcdf[i] = numpy.logaddexp2(log1mcdf[i], log1mcdf[i + 1])
+        logpdf[i] = logaddexp2(logpdf[i], logpdf[i + 1])

-    return smallest, log1mcdf
+    return smallest, logpdf
+
+
+@numba.njit(parallel=True, cache=True)
+def _all_pwm_to_mapping(motifs, motif_lengths, bin_size):
+    n = len(motif_lengths) - 1
+
+    smallests = numpy.empty(n, dtype='int64')
+    logpdfs = [numpy.empty(0) for i in range(n)]
+
+    for i in numba.prange(n):
+        s, e = motif_lengths[i], motif_lengths[i+1]
+
+        smallest, logpdf = _pwm_to_mapping(motifs[:, s:e], bin_size)
+        smallests[i] = smallest
+        logpdfs[i] = logpdf
+
+    return smallests, logpdfs


-@numba.njit(parallel=True, fastmath=True)
+@numba.njit(parallel=True, fastmath=True, cache=True)
def _fast_hits(X, chrom_lengths, pwm, pwm_lengths, score_threshold, bin_size,
smallest, score_to_pvals, score_to_pval_lengths):
n_motifs = len(pwm_lengths) - 1
@@ -180,14 +197,15 @@ def _fast_hits(X, chrom_lengths, pwm, pwm_lengths, score_threshold, bin_size,
return hits


-@numba.njit
+@numba.njit(cache=True)
def _fast_convert(X, mapping):
for i in range(X.shape[0]):
X[i] = mapping[X[i]]


def fimo(motifs, sequences, alphabet=['A', 'C', 'G', 'T'], bin_size=0.1,
-    eps=0.0001, threshold=0.0001, reverse_complement=True, dim=0):
+    eps=0.0001, threshold=0.0001, reverse_complement=True, return_counts=False,
+    dim=0):
"""An implementation of the FIMO algorithm from the MEME suite.
This function implements the "Finding Individual Motif Instances" (FIMO)
@@ -234,6 +252,11 @@ def fimo(motifs, sequences, alphabet=['A', 'C', 'G', 'T'], bin_size=0.1,
Whether to scan each motif and also the reverse complements. Default
is True.
+    return_counts: bool, optional
+        Whether to only return the count of the number of matches instead of
+        dataframes containing information about each match. If True, the return
+        will be a single array. Default is False.
dim: 0 or 1, optional
Whether to return one dataframe for each motif containing all hits for
that motif across all examples (0, default) or one dataframe for each
@@ -243,12 +266,13 @@
Returns
-------
-    hits: list of pandas.DataFrames
+    hits: list of pandas.DataFrames or numpy.ndarray
        A list of pandas.DataFrames containing motif hits, where the exact
-        semantics of each dataframe are determined by `dim`.
+        semantics of each dataframe are determined by `dim`. Alternatively,
+        a numpy array of just the number of counts per motif if return_counts
+        is set to True.
"""

-    tic = time.time()
log_threshold = math.log2(threshold)

# Extract the motifs and potentially the reverse complements
@@ -267,36 +291,28 @@

# Initialize arrays to store motif properties
n_motifs = len(motifs)
-    motif_pwms, motif_names, motif_lengths = [], [], [0]
-    _score_to_pvals, _score_to_pvals_lengths = [], [0]
-
-    _smallest = numpy.empty(n_motifs, dtype=numpy.int32)
-    _score_thresholds = numpy.empty(n_motifs, dtype=numpy.float32)
-
-    # Fill out these motif properties
-    for i, (name, motif) in enumerate(motifs):
-        motif_names.append(name)
-        motif_lengths.append(motif.shape[-1])
-
-        motif_pwm = numpy.log2(motif + eps) - math.log2(0.25)
-        motif_pwms.append(motif_pwm)
-
-        smallest, mapping = _pwm_to_mapping(motif_pwm, bin_size)
-        _smallest[i] = smallest
-        _score_to_pvals.append(mapping)
-        _score_to_pvals_lengths.append(len(mapping))
+    motif_names = numpy.array([name for name, _ in motifs])
+    motif_lengths = [0] + [pwm.shape[-1] for _, pwm in motifs]
+    motif_lengths = numpy.cumsum(motif_lengths).astype(numpy.uint64)
+
+    motif_pwms = numpy.concatenate([pwm for _, pwm in motifs], axis=-1)
+    motif_pwms = numpy.log2(motif_pwms + eps) - math.log2(0.25)
+
+    _smallest, _score_to_pvals = _all_pwm_to_mapping(motif_pwms, motif_lengths,
+        bin_size)
+    _score_to_pvals_lengths = [0]
+    _score_thresholds = numpy.empty(n_motifs, dtype=numpy.float32)
+
+    for i in range(n_motifs):
+        _score_to_pvals_lengths.append(len(_score_to_pvals[i]))

idx = numpy.where(_score_to_pvals[i] < log_threshold)[0]
if len(idx) > 0:
-            _score_thresholds[i] = (idx[0] + smallest) * bin_size
+            _score_thresholds[i] = (idx[0] + _smallest[i]) * bin_size
else:
_score_thresholds[i] = float("inf")

-    # Convert these back to numpy arrays
-    motif_pwms = numpy.concatenate(motif_pwms, axis=-1)
-    motif_names = numpy.array(motif_names)
-    motif_lengths = numpy.cumsum(motif_lengths).astype(numpy.uint64)
-
_score_to_pvals = numpy.concatenate(_score_to_pvals)
_score_to_pvals_lengths = numpy.cumsum(_score_to_pvals_lengths)

@@ -344,6 +360,12 @@ def fimo(motifs, sequences, alphabet=['A', 'C', 'G', 'T'], bin_size=0.1,
names = ['sequence_name', 'start', 'end', 'score', 'p-value']
n_ = n_motifs // 2 if reverse_complement else n_motifs

+    if return_counts:
+        counts = numpy.zeros(n_, dtype='int32')
+        for i in range(n_):
+            counts[i] = len(hits[i])
+            if reverse_complement:
+                counts[i] += len(hits[i + n_])
+        return counts

for i in range(n_):
if reverse_complement:
hits_ = pandas.DataFrame(hits[i] + hits[i + n_], columns=names)
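A minimal sketch of the new return_counts option added in this file, assuming motifs are supplied as a MEME-format file; "motifs.meme" is a placeholder path and the one-hot input is illustrative rather than a verbatim API guarantee:

    import numpy
    import torch
    from tangermeme.tools.fimo import fimo

    # illustrative one-hot sequences with shape (n_sequences, 4, length)
    idx = numpy.random.RandomState(0).choice(4, size=(16, 1000))
    X = torch.from_numpy(numpy.eye(4, dtype='float32')[idx].transpose(0, 2, 1))

    counts = fimo("motifs.meme", X, return_counts=True)  # one count per motif
    hits = fimo("motifs.meme", X)  # default: a list of dataframes, one per motif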