Skip to content

Commit cc25c37

Browse files
authored Jul 16, 2024
Merge pull request #3064 from snbianco/ASB-27903-cloud-uris-from-query
Streamlined method to get list of cloud URIs
2 parents 26050ed + a1349a1 commit cc25c37

File tree

7 files changed

+214
-63
lines changed

7 files changed

+214
-63
lines changed
 

‎CHANGES.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,10 @@ mast
169169

170170
- Fix bug in ``Catalogs.query_criteria()`` to use ``page`` and ``pagesize`` parameters correctly. [#3065]
171171

172+
- Modify ``mast.Observations.get_cloud_uris`` to also accept query criteria and data product filters. [#3064]
173+
174+
- Increased the speed of ``mast.Observations.get_cloud_uris`` by obtaining multiple
175+
URIs from MAST at once. [#3064]
172176

173177

174178
0.4.7 (2024-03-08)

‎astroquery/mast/cloud.py

Lines changed: 33 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from astropy.utils.console import ProgressBarOrSpinner
1515
from astropy.utils.exceptions import AstropyDeprecationWarning
1616

17-
from ..exceptions import NoResultsWarning, InvalidQueryError
17+
from ..exceptions import NoResultsWarning
1818

1919
from . import utils
2020

@@ -109,32 +109,14 @@ def get_cloud_uri(self, data_product, include_bucket=True, full_url=False):
109109
found in the cloud, None is returned.
110110
"""
111111

112-
s3_client = self.boto3.client('s3', config=self.config)
113-
114-
path = utils.mast_relative_path(data_product["dataURI"])
115-
if path is None:
116-
raise InvalidQueryError("Malformed data uri {}".format(data_product['dataURI']))
112+
uri_list = self.get_cloud_uri_list(data_product, include_bucket=include_bucket, full_url=full_url)
117113

118-
if 'galex' in path:
119-
path = path.lstrip("/mast/")
120-
elif '/ps1/' in path:
121-
path = path.replace("/ps1/", "panstarrs/ps1/public/")
114+
# Making sure we got at least 1 URI from the query above.
115+
if not uri_list or uri_list[0] is None:
116+
warnings.warn("Unable to locate file {}.".format(data_product), NoResultsWarning)
122117
else:
123-
path = path.lstrip("/")
124-
125-
try:
126-
s3_client.head_object(Bucket=self.pubdata_bucket, Key=path)
127-
if include_bucket:
128-
path = "s3://{}/{}".format(self.pubdata_bucket, path)
129-
elif full_url:
130-
path = "http://s3.amazonaws.com/{}/{}".format(self.pubdata_bucket, path)
131-
return path
132-
except self.botocore.exceptions.ClientError as e:
133-
if e.response['Error']['Code'] != "404":
134-
raise
135-
136-
warnings.warn("Unable to locate file {}.".format(data_product['productFilename']), NoResultsWarning)
137-
return None
118+
# Output from ``get_cloud_uri_list`` is always a list even when it's only 1 URI
119+
return uri_list[0]
138120

139121
def get_cloud_uri_list(self, data_products, include_bucket=True, full_url=False):
140122
"""
@@ -158,8 +140,33 @@ def get_cloud_uri_list(self, data_products, include_bucket=True, full_url=False)
158140
List of URIs generated from the data products. The list may contain entries that are None
159141
if data_products includes products not found in the cloud.
160142
"""
143+
s3_client = self.boto3.client('s3', config=self.config)
161144

162-
return [self.get_cloud_uri(product, include_bucket, full_url) for product in data_products]
145+
paths = utils.mast_relative_path(data_products["dataURI"])
146+
if isinstance(paths, str): # Handle the case where only one product was requested
147+
paths = [paths]
148+
149+
uri_list = []
150+
for path in paths:
151+
if path is None:
152+
uri_list.append(None)
153+
else:
154+
try:
155+
# Use `head_object` to verify that the product is available on S3 (not all products are)
156+
s3_client.head_object(Bucket=self.pubdata_bucket, Key=path)
157+
if include_bucket:
158+
s3_path = "s3://{}/{}".format(self.pubdata_bucket, path)
159+
uri_list.append(s3_path)
160+
elif full_url:
161+
path = "http://s3.amazonaws.com/{}/{}".format(self.pubdata_bucket, path)
162+
uri_list.append(path)
163+
except self.botocore.exceptions.ClientError as e:
164+
if e.response['Error']['Code'] != "404":
165+
raise
166+
warnings.warn("Unable to locate file {}.".format(path), NoResultsWarning)
167+
uri_list.append(None)
168+
169+
return uri_list
163170

164171
def download_file(self, data_product, local_path, cache=True, verbose=True):
165172
"""

‎astroquery/mast/observations.py

Lines changed: 58 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -774,26 +774,56 @@ def download_products(self, products, *, download_dir=None, flat=False,
774774

775775
return manifest
776776

777-
def get_cloud_uris(self, data_products, *, include_bucket=True, full_url=False):
777+
def get_cloud_uris(self, data_products=None, *, include_bucket=True, full_url=False, pagesize=None, page=None,
778+
mrp_only=False, extension=None, filter_products={}, **criteria):
778779
"""
779-
Takes an `~astropy.table.Table` of data products and returns the associated cloud data uris.
780+
Given an `~astropy.table.Table` of data products or query criteria and filter parameters,
781+
returns the associated cloud data URIs.
780782
781783
Parameters
782784
----------
783785
data_products : `~astropy.table.Table`
784-
Table containing products to be converted into cloud data uris.
786+
Table containing products to be converted into cloud data uris. If provided, this will supersede
787+
pagesize, page, or any keyword arguments passed in as criteria.
785788
include_bucket : bool
786-
Default True. When false returns the path of the file relative to the
789+
Default True. When False, returns the path of the file relative to the
787790
top level cloud storage location.
788791
Must be set to False when using the full_url argument.
789792
full_url : bool
790793
Default False. Return an HTTP fetchable url instead of a cloud uri.
791794
Must set include_bucket to False to use this option.
795+
pagesize : int, optional
796+
Default None. Can be used to override the default pagesize when making a query.
797+
E.g. when using a slow internet connection. Query criteria must also be provided.
798+
page : int, optional
799+
Default None. Can be used to override the default behavior of all results being returned for a query
800+
to obtain one specific page of results. Query criteria must also be provided.
801+
mrp_only : bool, optional
802+
Default False. When set to True, only "Minimum Recommended Products" will be returned.
803+
extension : string or array, optional
804+
Default None. Option to filter by file extension.
805+
filter_products : dict, optional
806+
Filters to be applied to data products. Valid filters are all products fields listed
807+
`here <https://mast.stsci.edu/api/v0/_productsfields.html>`__.
808+
The column name as a string is the key. The corresponding value is one
809+
or more acceptable values for that parameter.
810+
Filter behavior is AND between the filters and OR within a filter set.
811+
For example: {"productType": "SCIENCE", "extension": ["fits", "jpg"]}
812+
**criteria
813+
Criteria to apply. At least one non-positional criterion must be supplied.
814+
Valid criteria are coordinates, objectname, radius (as in `query_region` and `query_object`),
815+
and all observation fields returned by the ``get_metadata("observations")``.
816+
The Column Name is the keyword, with the argument being one or more acceptable values for that parameter,
817+
except for fields with a float datatype where the argument should be in the form [minVal, maxVal].
818+
For non-float type criteria, wildcards may be used (both * and % are considered wildcards); however,
819+
only one wildcarded value can be processed per criterion.
820+
RA and Dec must be given in decimal degrees, and datetimes in MJD.
821+
For example: filters=["FUV","NUV"],proposal_pi="Ost*",t_max=[52264.4586,54452.8914]
792822
793823
Returns
794824
-------
795825
response : list
796-
List of URIs generated from the data products, list way contain entries that are None
826+
List of URIs generated from the data products. May contain entries that are None
797827
if data_products includes products not found in the cloud.
798828
"""
799829

@@ -802,6 +832,29 @@ def get_cloud_uris(self, data_products, *, include_bucket=True, full_url=False):
802832
'Please enable anonymous cloud access by calling `enable_cloud_dataset` method. '
803833
'Refer to `~astroquery.mast.ObservationsClass.enable_cloud_dataset` documentation for more info.')
804834

835+
if data_products is None:
836+
if not criteria:
837+
raise InvalidQueryError(
838+
'Please provide either a `~astropy.table.Table` of data products or query criteria.'
839+
)
840+
else:
841+
# Get table of observations based on query criteria
842+
obs = self.query_criteria(pagesize=pagesize, page=page, **criteria)
843+
844+
if not len(obs):
845+
# Warning raised by ~astroquery.mast.ObservationsClass.query_criteria
846+
return
847+
848+
# Return list of associated data products
849+
data_products = self.get_product_list(obs)
850+
851+
# Filter product list
852+
data_products = self.filter_products(data_products, mrp_only=mrp_only, extension=extension, **filter_products)
853+
854+
if not len(data_products):
855+
warnings.warn("No matching products to fetch associated cloud URIs.", NoResultsWarning)
856+
return
857+
805858
# Remove duplicate products
806859
data_products = self._remove_duplicate_products(data_products)
807860

‎astroquery/mast/tests/test_mast_remote.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -526,12 +526,13 @@ def test_get_cloud_uri(self, test_data_uri, expected_cloud_uri):
526526
assert len(uri) > 0, f'Product for dataURI {test_data_uri} was not found in the cloud.'
527527
assert uri == expected_cloud_uri, f'Cloud URI does not match expected. ({uri} != {expected_cloud_uri})'
528528

529-
def test_get_cloud_uris(self):
529+
@pytest.mark.parametrize("test_obs_id", ["25568122", "31411"])
530+
def test_get_cloud_uris(self, test_obs_id):
530531
pytest.importorskip("boto3")
531-
test_obs_id = '25568122'
532532

533533
# get a product list
534-
products = Observations.get_product_list(test_obs_id)[24:]
534+
index = 24 if test_obs_id == '25568122' else 0
535+
products = Observations.get_product_list(test_obs_id)[index:]
535536

536537
assert len(products) > 0, (f'No products found for OBSID {test_obs_id}. '
537538
'Unable to move forward with getting URIs from the cloud.')
@@ -544,6 +545,36 @@ def test_get_cloud_uris(self):
544545

545546
assert len(uris) > 0, f'Products for OBSID {test_obs_id} were not found in the cloud.'
546547

548+
# check for warning if no data products match filters
549+
with pytest.warns(NoResultsWarning):
550+
Observations.get_cloud_uris(products,
551+
extension='png')
552+
553+
def test_get_cloud_uris_query(self):
554+
pytest.importorskip("boto3")
555+
556+
# enable access to public AWS S3 bucket
557+
Observations.enable_cloud_dataset()
558+
559+
# get uris with other functions
560+
obs = Observations.query_criteria(target_name=234295610)
561+
prod = Observations.get_product_list(obs)
562+
filt = Observations.filter_products(prod, calib_level=[2])
563+
s3_uris = Observations.get_cloud_uris(filt)
564+
565+
# get uris with streamlined function
566+
uris = Observations.get_cloud_uris(target_name=234295610,
567+
filter_products={'calib_level': [2]})
568+
assert s3_uris == uris
569+
570+
# check that InvalidQueryError is raised if neither data_products nor **criteria are defined
571+
with pytest.raises(InvalidQueryError):
572+
Observations.get_cloud_uris(filter_products={'calib_level': [2]})
573+
574+
# check for warning if query returns no observations
575+
with pytest.warns(NoResultsWarning):
576+
Observations.get_cloud_uris(target_name=234295611)
577+
547578
######################
548579
# CatalogClass tests #
549580
######################

‎astroquery/mast/utils.py

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -158,22 +158,54 @@ def parse_input_location(coordinates=None, objectname=None):
158158

159159
def mast_relative_path(mast_uri):
160160
"""
161-
Given a MAST dataURI, return the associated relative path.
161+
Given one or more MAST dataURI(s), return the associated relative path(s).
162162
163163
Parameters
164164
----------
165-
mast_uri : str
166-
The MAST uri.
165+
mast_uri : str, list of str
166+
The MAST uri(s).
167167
168168
Returns
169169
-------
170-
response : str
171-
The associated relative path.
170+
response : str, list of str
171+
The associated relative path(s).
172172
"""
173-
174-
response = _simple_request("https://mast.stsci.edu/api/v0.1/path_lookup/",
175-
{"uri": mast_uri})
176-
result = response.json()
177-
uri_result = result.get(mast_uri)
178-
179-
return uri_result["path"]
173+
if isinstance(mast_uri, str):
174+
uri_list = [("uri", mast_uri)]
175+
else: # mast_uri parameter is a list
176+
uri_list = [("uri", uri) for uri in mast_uri]
177+
178+
# Split the list into chunks of 50 URIs; this is necessary
179+
# to avoid "414 Client Error: Request-URI Too Large".
180+
uri_list_chunks = list(_split_list_into_chunks(uri_list, chunk_size=50))
181+
182+
result = []
183+
for chunk in uri_list_chunks:
184+
response = _simple_request("https://mast.stsci.edu/api/v0.1/path_lookup/",
185+
{"uri": chunk})
186+
json_response = response.json()
187+
188+
for uri in chunk:
189+
# Chunk is a list of tuples where the tuple is
190+
# ("uri", "/path/to/product")
191+
# so we index for path (index=1)
192+
path = json_response.get(uri[1])["path"]
193+
if 'galex' in path:
194+
path = path.lstrip("/mast/")
195+
elif '/ps1/' in path:
196+
path = path.replace("/ps1/", "panstarrs/ps1/public/")
197+
else:
198+
path = path.lstrip("/")
199+
result.append(path)
200+
201+
# If the input was a single URI string, we return a single string
202+
if isinstance(mast_uri, str):
203+
return result[0]
204+
# Else, return a list of paths
205+
return result
206+
207+
208+
def _split_list_into_chunks(input_list, chunk_size):
209+
"""Helper function for `mast_relative_path`."""
210+
for idx in range(0, len(input_list), chunk_size):
211+
yield input_list[idx:idx + chunk_size]

‎docs/mast/mast_catalog.rst

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,13 @@ The returned fields vary by catalog, find the field documentation for specific c
2424
`here <https://mast.stsci.edu/api/v0/pages.html>`__.
2525
If no catalog is specified, the Hubble Source Catalog will be queried.
2626

27+
2728
.. doctest-remote-data::
2829

2930
>>> from astroquery.mast import Catalogs
3031
...
3132
>>> catalog_data = Catalogs.query_object("158.47924 -7.30962", catalog="Galex")
32-
>>> print(catalog_data[:10])
33+
>>> print(catalog_data[:10]) # doctest: +IGNORE_OUTPUT
3334
distance_arcmin objID survey ... fuv_flux_aper_7 fuv_artifact
3435
------------------ ------------------- ------ ... --------------- ------------
3536
0.3493802506329695 6382034098673685038 AIS ... 0.047751952 0
@@ -261,19 +262,17 @@ Given an HSC Match ID, return all catalog results.
261262
>>> catalog_data = Catalogs.query_object("M10", radius=.02, catalog="HSC")
262263
>>> matchid = catalog_data[0]["MatchID"]
263264
>>> print(matchid)
264-
63980492
265+
7542452
265266
>>> matches = Catalogs.query_hsc_matchid(matchid)
266267
>>> print(matches)
267-
CatID MatchID ... cd_matrix
268-
--------- -------- ... ------------------------------------------------------
269-
257195287 63980492 ... -1.38889e-005 -5.26157e-010 -5.26157e-010 1.38889e-005
270-
257440119 63980492 ... -1.38889e-005 -5.26157e-010 -5.26157e-010 1.38889e-005
271-
428373428 63980492 ... -1.10056e-005 5.65193e-010 5.65193e-010 1.10056e-005
272-
428373427 63980492 ... -1.10056e-005 5.65193e-010 5.65193e-010 1.10056e-005
273-
428373429 63980492 ... -1.10056e-005 5.65193e-010 5.65193e-010 1.10056e-005
274-
410574499 63980492 ... -1.10056e-005 1.56577e-009 1.56577e-009 1.10056e-005
275-
410574498 63980492 ... -1.10056e-005 1.56577e-009 1.56577e-009 1.10056e-005
276-
410574497 63980492 ... -1.10056e-005 1.56577e-009 1.56577e-009 1.10056e-005
268+
CatID MatchID ... cd_matrix
269+
--------- ------- ... ------------------------------------------------------
270+
419094794 7542452 ... -1.10056e-005 5.65193e-010 5.65193e-010 1.10056e-005
271+
419094795 7542452 ... -1.10056e-005 5.65193e-010 5.65193e-010 1.10056e-005
272+
401289578 7542452 ... -1.10056e-005 1.56577e-009 1.56577e-009 1.10056e-005
273+
401289577 7542452 ... -1.10056e-005 1.56577e-009 1.56577e-009 1.10056e-005
274+
257194049 7542452 ... -1.38889e-005 -5.26157e-010 -5.26157e-010 1.38889e-005
275+
257438887 7542452 ... -1.38889e-005 -5.26157e-010 -5.26157e-010 1.38889e-005
277276

278277

279278
HSC spectra can be accessed through this class as well. `~astroquery.mast.CatalogsClass.get_hsc_spectra`

0 commit comments

Comments
 (0)