Skip to content

Commit af2fe66

Browse files
yuyuan871111cbouylilyminium
authored
Update ResidueGroup/SegmentGroup with ids (PDBParser: force chainid to segid / Universe: set groups) (#4965)
* Fixes #4948 #2874 * Universe: add a function in the universe set_groups to update the topology and groups in the universe when atomwise resids/segids is given (more generalized method across different file type) * PDBParser: add an argument force_chainids_to_segids in PDBReader to forcibly reading the chain ID into the segment ID (PDB specific method) * update docs * add tests * update CHANGELOG --------- Co-authored-by: Cédric Bouysset <bouysset.cedric@gmail.com> Co-authored-by: Lily Wang <31115101+lilyminium@users.noreply.github.com>
1 parent 740332c commit af2fe66

File tree

6 files changed

+614
-6
lines changed

6 files changed

+614
-6
lines changed

package/CHANGELOG

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ The rules for this file:
1515

1616
-------------------------------------------------------------------------------
1717
??/??/?? IAlibay, orbeckst, BHM-Bob, TRY-ER, Abdulrahman-PROG, pbuslaev,
18-
yuxuanzhuang
18+
yuxuanzhuang, yuyuan871111
1919

2020
* 2.10.0
2121

@@ -34,6 +34,14 @@ Enhancements
3434
* Improve parsing of topology information from LAMMPS dump files to allow
3535
reading of mass, charge and element attributes. (Issue #3449, PR #4995)
3636
* Added timestep support for XDR writers and readers (Issue #4905, PR #4908)
37+
* New feature in `Universe`: `set_groups`. This function allows updating
38+
`_topology`, `residues` (ResidueGroup), `segments` (SegmentGroup) in
39+
the `Universe` using user-provided atomwise `resids` and/or `segids`.
40+
(Issue #4948 #2874, PR #4965)
41+
* Adds an argument `force_chainids_to_segids` in `PDBParser`: this will
42+
force the use of ChainID as the segID when reading PDB (it is helpful
43+
if the segment IDs (segids) are partially missing in the PDB file).
44+
(Issue #4948 #2874, PR #4965)
3745

3846
Changes
3947

package/MDAnalysis/coordinates/PDB.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,11 @@ class PDBReader(base.ReaderBase):
244244
:class:`PDBWriter`
245245
:class:`PDBReader`
246246
247+
If you would like to force the use of chainID as the segID when parsing PDB,
248+
please use the keyword argument `force_chainids_to_segids=True`
249+
(:class:`MDAnalysis.topology.PDBParser.PDBParser`).
250+
This will prioritize the chain ID to the segment ID.
251+
247252
248253
.. versionchanged:: 0.11.0
249254
* Frames now 0-based instead of 1-based

package/MDAnalysis/core/universe.py

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@
8686
from .topology import Topology
8787
from .topologyattrs import (
8888
AtomAttr, ResidueAttr, SegmentAttr,
89+
Segindices, Segids, Resindices, Resids, Atomindices,
8990
BFACTOR_WARNING, _Connection
9091
)
9192
from .topologyobjects import TopologyObject
@@ -94,6 +95,105 @@
9495
logger = logging.getLogger("MDAnalysis.core.universe")
9596

9697

98+
def _update_topology_by_ids(universe, atomwise_resids, atomwise_segids):
99+
"""Update the topology of a Universe with new atomwise resids and segids.
100+
101+
Parameters
102+
----------
103+
universe : Universe
104+
The universe to update.
105+
atomwise_resids : numpy.ndarray
106+
The new atomwise residue indices.
107+
atomwise_segids : numpy.ndarray
108+
The new atomwise segment indices.
109+
"""
110+
from ..topology.base import change_squash
111+
112+
# the original topology
113+
top = universe._topology
114+
115+
# detect the atom level attributes (excluding residue level and above)
116+
atom_attrindices = [
117+
idx
118+
for idx, each_attr in enumerate(top.attrs)
119+
if (Residue not in each_attr.target_classes) and
120+
(not isinstance(each_attr, Atomindices))
121+
]
122+
attrs = [top.attrs[each_attr] for each_attr in atom_attrindices]
123+
124+
# create new residues level stuff
125+
residue_attrindices = [
126+
idx
127+
for idx, each_attr in enumerate(top.attrs)
128+
if (
129+
(Residue in each_attr.target_classes) and
130+
(Segment not in each_attr.target_classes) and
131+
(not isinstance(each_attr, Resids)) and
132+
(not isinstance(each_attr, Resindices))
133+
)
134+
] # residue level attributes except resids and resindices
135+
136+
res_criteria = [atomwise_resids, atomwise_segids] + [
137+
getattr(universe.atoms, top.attrs[each_attr].attrname)
138+
for each_attr in residue_attrindices
139+
if top.attrs[each_attr].attrname != 'resnums'
140+
]
141+
142+
res_to_squash = [atomwise_resids, atomwise_segids] + [
143+
getattr(universe.atoms, top.attrs[each_attr].attrname)
144+
for each_attr in residue_attrindices
145+
]
146+
147+
residx, res_squashed = change_squash(res_criteria, res_to_squash)
148+
resids = res_squashed[0]
149+
res_squashed_segids = res_squashed[1]
150+
n_residues = len(resids)
151+
# all residue-level attributes except resids
152+
res_squashed_res_attrs = res_squashed[2:]
153+
154+
res_attrs = [Resids(resids)] + [
155+
# use the correspdoning type of the attribute with new sqaushed values
156+
top.attrs[each_attr].__class__(res_squashed_res_attrs[idx])
157+
for idx, each_attr in enumerate(residue_attrindices)
158+
]
159+
attrs.extend(res_attrs)
160+
161+
# create new segment level stuff
162+
segidx, (segids,) = change_squash((res_squashed_segids,), (res_squashed_segids,))
163+
n_segments = len(segids)
164+
attrs.append(Segids(segids))
165+
166+
# other attributes
167+
other_attrs = [
168+
each_attr
169+
for each_attr in top.attrs
170+
if (
171+
(Segment in each_attr.target_classes) and
172+
(not isinstance(each_attr, Segids)) and
173+
(not isinstance(each_attr, Atomindices)) and
174+
(not isinstance(each_attr, Resindices)) and
175+
(not isinstance(each_attr, Segindices))
176+
)
177+
]
178+
179+
# create new topology
180+
top = Topology(
181+
universe.atoms.n_atoms,
182+
n_residues,
183+
n_segments,
184+
attrs=attrs,
185+
atom_resindex=residx,
186+
residue_segindex=segidx,
187+
)
188+
189+
# add back other attributes
190+
if len(other_attrs) > 0:
191+
for each_otherattr in other_attrs:
192+
top.add_TopologyAttr(each_otherattr)
193+
194+
# update the topology in the universe
195+
universe._topology = top
196+
97197

98198
def _check_file_like(topology):
99199
if isstream(topology):
@@ -312,6 +412,12 @@ class Universe(object):
312412
functionality to treat independent trajectory files as a single virtual
313413
trajectory.
314414
**kwargs: extra arguments are passed to the topology parser.
415+
For instance, when reading a PDB file
416+
(:class:`PDBReader<MDAnalysis.coordinates.PDB>`,
417+
:class:`PDBParser<MDAnalysis.topology.PDBParser>`), set
418+
``force_chainids_to_segids=True`` to make the universe use the
419+
chainIDs (column 22) instead of the segmentIDs (column 73-76) as the
420+
`segids` in the universe and select the corresponding SegmentGroup.
315421
316422
Attributes
317423
----------
@@ -380,6 +486,10 @@ class Universe(object):
380486
guessing masses and atom types after topology
381487
is read from a registered parser.
382488
489+
.. versionchanged:: 2.10.0
490+
Added :meth: `~MDAnalysis.core.universe.Universe.set_groups`
491+
API to set residues/segments based on the atomwise resids/segids.
492+
383493
"""
384494
def __init__(self, topology=None, *coordinates, all_coordinates=False,
385495
format=None, topology_format=None, transformations=None,
@@ -1700,6 +1810,104 @@ def guess_TopologyAttrs(
17001810
warnings.warn('Can not guess attributes '
17011811
'for universe with 0 atoms')
17021812

1813+
def set_groups(self, atomwise_resids=None, atomwise_segids=None):
1814+
"""Set the groups (`ResidueGroup`, `SegmentGroup`) of the Universe
1815+
by atomwise resids/segids.
1816+
1817+
The `topology` will also be updated based on the provided `atomwise_resids`
1818+
and `atomwise_segids`. The original `resids` and `segids` will be stored
1819+
in attributes `atomwise_resids_orig` and/or `atomwise_segids_orig` if
1820+
they are modified.
1821+
See notes for the logic of the function.
1822+
1823+
Parameters
1824+
----------
1825+
atomwise_resids:
1826+
A list of residue IDs to be set for the Universe. The length
1827+
of the list should be equal to the number of atoms in the Universe.
1828+
If `None`, the original resids will be used.
1829+
1830+
atomwise_segids:
1831+
A list of segment IDs to be set for the Universe. The length
1832+
of the list should be equal to the number of atoms in the Universe.
1833+
If `None`, the original segids will be used.
1834+
1835+
Raises
1836+
------
1837+
AssertionError
1838+
If the length of the provided atomwise_resids or atomwise_segids
1839+
does not match the number of atoms in the Universe.
1840+
1841+
Notes
1842+
-----
1843+
First, the function will check if resids or segids is provided.
1844+
If both resids and segids are not provided (`None`), it will do nothing.
1845+
If only one of them is provided, it will use the original values for the
1846+
other one. If both are provided, it will use the provided values for
1847+
both resids and segids.
1848+
The function will then update the topology by a new generated topology
1849+
with new values of the resids and segids.
1850+
Finally, the corresponding new `ResidueGroup` and `SegmentGroup` will be
1851+
created by the updated topology.
1852+
1853+
Examples
1854+
--------
1855+
To set custom segment IDs for the segments of the Universe::
1856+
1857+
atomwise_segids = ['A', 'A', 'B', 'B']
1858+
u.set_groups(atomwise_segids=atomwise_segids)
1859+
1860+
# Now the Universe has two segments with segIDs 'A' and 'B'
1861+
u.segments
1862+
>>> <SegmentGroup with 2 segments>
1863+
1864+
.. versionadded:: 2.10.0
1865+
"""
1866+
if (atomwise_resids is None) and (atomwise_segids is None):
1867+
warnings.warn("Not setting groups. Please provide atomwise_resids or "
1868+
"atomwise_segids.")
1869+
return
1870+
1871+
# resids
1872+
if atomwise_resids is None:
1873+
atomwise_resids = self.atoms.resids
1874+
1875+
else:
1876+
# check the length of atomwise_resids
1877+
if len(atomwise_resids) != self.atoms.n_atoms:
1878+
raise ValueError(
1879+
"The length of atomwise_resids should be the same as "
1880+
"the number of atoms in the universe.")
1881+
1882+
self.atomwise_resids_orig = self.atoms.resids
1883+
logger.info("The new resids replaces the current one. "
1884+
"The original resids is stored in "
1885+
"atomwise_resids_orig.")
1886+
1887+
# segids
1888+
if atomwise_segids is None:
1889+
atomwise_segids = self.atoms.segids
1890+
1891+
else:
1892+
# check the length of atomwise_segids
1893+
if len(atomwise_segids) != self.atoms.n_atoms:
1894+
raise ValueError(
1895+
"The length of atomwise_segids should be the same as "
1896+
"the number of atoms in the universe.")
1897+
1898+
self.atomwise_segids_orig = self.atoms.segids
1899+
logger.info("The new resids replaces the current one. "
1900+
"The original segids is stored in "
1901+
"atomwise_segids_orig.")
1902+
1903+
atomwise_resids = np.array(atomwise_resids, dtype=int)
1904+
atomwise_segids = np.array(atomwise_segids, dtype=object)
1905+
1906+
_update_topology_by_ids(self,
1907+
atomwise_resids=atomwise_resids,
1908+
atomwise_segids=atomwise_segids)
1909+
_generate_from_topology(self)
1910+
17031911

17041912
def Merge(*args):
17051913
"""Create a new new :class:`Universe` from one or more

package/MDAnalysis/topology/PDBParser.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,11 @@ class PDBParser(TopologyReaderBase):
178178
- bonds
179179
- formalcharges
180180
181+
Note that `PDBParser` accepts an optional keyword argument
182+
``force_chainids_to_segids``. If set to ``True``, the chain IDs (even if
183+
empty values are in the chain ID column in the file) will forcibly be used
184+
instead of the segment IDs for creating segments.
185+
181186
See Also
182187
--------
183188
:class:`MDAnalysis.coordinates.PDB.PDBReader`
@@ -207,7 +212,10 @@ class PDBParser(TopologyReaderBase):
207212
Removed type and mass guessing (attributes guessing takes place now
208213
through universe.guess_TopologyAttrs() API).
209214
.. versionchanged:: 2.10.0
210-
segID is read from 73-76 instead of 67-76.
215+
segID is read from 73-76 instead of 67-76 and added the
216+
`force_chainids_to_segids` keyword argument. Some infos in logger will
217+
be generated if the segids is not present or if the chainids are not
218+
completely equal to segids.
211219
"""
212220
format = ['PDB', 'ENT']
213221

@@ -218,7 +226,7 @@ def parse(self, **kwargs):
218226
-------
219227
MDAnalysis Topology object
220228
"""
221-
top = self._parseatoms()
229+
top = self._parseatoms(**kwargs)
222230

223231
try:
224232
bonds = self._parsebonds(top.ids.values)
@@ -235,7 +243,7 @@ def parse(self, **kwargs):
235243

236244
return top
237245

238-
def _parseatoms(self):
246+
def _parseatoms(self, **kwargs):
239247
"""Create the initial Topology object"""
240248
resid_prev = 0 # resid looping hack
241249

@@ -325,6 +333,12 @@ def _parseatoms(self):
325333
"found in the PDB file.")
326334
segids = chainids
327335

336+
# If force_chainids_to_segids is set, use chainids as segids
337+
if kwargs.get("force_chainids_to_segids", False):
338+
logger.info("force_chainids_to_segids is set. "
339+
"Using chain IDs as segment IDs.")
340+
segids = chainids
341+
328342
n_atoms = len(serials)
329343

330344
attrs = []
@@ -403,11 +417,13 @@ def _parseatoms(self):
403417
n_residues = len(resids)
404418
attrs.append(Resnums(resnums))
405419
attrs.append(Resids(resids))
406-
attrs.append(Resnums(resids.copy()))
407420
attrs.append(ICodes(icodes))
408421
attrs.append(Resnames(resnames))
409422

410-
if any(segids) and not any(val is None for val in segids):
423+
if (
424+
kwargs.get("force_chainids_to_segids", False) or
425+
(any(segids) and not any(val is None for val in segids))
426+
):
411427
segidx, (segids,) = change_squash((segids,), (segids,))
412428
n_segments = len(segids)
413429
attrs.append(Segids(segids))

0 commit comments

Comments
 (0)