Skip to content

Commit

Permalink
Addressing Bug astropy#543.
Browse files Browse the repository at this point in the history
To do that, we split the original iter_datalinks in the two cases; one
where the datalinks come from a table via a datalink meta RESOURCE, and the
other where we believe there's datalink products in the table.

In the second case (the one that was broken before), we try the name access_format as attribute and column name, and then a utype and two UCDs
(generic and SIA1) each.  We could add SSAP UCDs, but I'll only do that
if someone is asking for it.
  • Loading branch information
msdemlei committed Sep 17, 2024
1 parent e33f9a9 commit c206cbe
Show file tree
Hide file tree
Showing 4 changed files with 215 additions and 46 deletions.
4 changes: 3 additions & 1 deletion CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,12 @@ Enhancements and Fixes
- Tables returned by RegistryResource.get_tables() now have a utype
attribute [#576]

- iter_metadata() no longer crashes on tables with a datalink RESOURCE
and without obscore attributes [#599]

Deprecations and Removals
-------------------------


- SodaRecordMixin no longer will use a datalink#links endpoint for soda [#580]

- Deprecating the use of "image" and "spectrum" in registry Servicetype
Expand Down
152 changes: 109 additions & 43 deletions pyvo/dal/adhoc.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def _get_accessurl_from_params(params):

class AdhocServiceResultsMixin:
"""
Mixin for adhoc:service functionallity for results classes.
Mixin for adhoc:service functionality for results classes.
"""

def __init__(self, votable, *, url=None, session=None):
Expand Down Expand Up @@ -169,6 +169,109 @@ class DatalinkResultsMixin(AdhocServiceResultsMixin):
"""
Mixin for datalink functionallity for results classes.
"""
def iter_datalinks_from_dlblock(self, datalink_service):
"""yields datalinks from the current rows using a datalink
service RESOURCE.
"""
remaining_ids = [] # remaining IDs to processed
current_batch = None # retrieved but not returned yet
current_ids = [] # retrieved but not returned
processed_ids = [] # retrived and returned IDs
batch_size = None # size of the batch

for row in self:
if not current_ids:
if batch_size is None:
# first call.
self.query = DatalinkQuery.from_resource(
[_ for _ in self],
self._datalink,
session=self._session,
original_row=row)
remaining_ids = self.query['ID']
if not remaining_ids:
# we are done
return
if batch_size:
# subsequent calls are limitted to batch size
self.query['ID'] = remaining_ids[:batch_size]
current_batch = self.query.execute(post=True)
current_ids = list(OrderedDict.fromkeys(
[_ for _ in current_batch.to_table()['ID']]))
if not current_ids:
raise DALServiceError(
'Could not retrieve datalinks for: {}'.format(
', '.join([_ for _ in remaining_ids])))
batch_size = len(current_ids)
id1 = current_ids.pop(0)
processed_ids.append(id1)
remaining_ids.remove(id1)
yield current_batch.clone_byid(
id1,
original_row=row)

@staticmethod
def _guess_access_format(row):
"""returns a guess for the format of what __guess_access_url will
return.
This tries a few heuristics based on how obscore or SIA records might
be marked up. If will return None if row does not look as if
it contained an access format. Note that the heuristics are
tried in sequence; for now, we do not define the sequence of
heuristics.
"""
if hasattr(row, "access_format"):
return row.access_format

if "access_format" in row:
return row["access_format"]

access_format = row.getbyutype("obscore:access.format")
if access_format:
return access_format

access_format = row.getbyucd("meta.code.mime")
if access_format:
return access_format

@staticmethod
def _guess_access_url(row):
"""returns a guess for a URI to a data product in row.
This tries a few heuristics based on how obscore or SIA records might
be marked up. If will return None if row does not look as if
it contained a product access url. Note that the heuristics are
tried in sequence; for now, we do not define the sequence of
heuristics.
"""
if hasattr(row, "access_url"):
return row.access_url

if "access_url" in row:
return row["access_url"]

access_url = row.getbyutype("obscore:access.reference")
if access_url:
return access_url

access_url = row.getbyucd("meta.ref.url")
if access_url:
return access_url

def iter_datalinks_from_product_rows(self):
"""yield datalinks from self's rows if they describe datalink-valued
products.
"""
for row in self:
# TODO: we should be more careful about whitespace, case
# and perhaps more parameters in the following comparison
if self._guess_access_format(row) == DATALINK_MIME_TYPE:
access_url = self._guess_access_url(row)
if access_url is not None:
yield DatalinkResults.from_result_url(
access_url,
original_row=row)

def iter_datalinks(self):
"""
Expand All @@ -186,49 +289,12 @@ def iter_datalinks(self):
self._datalink = self.get_adhocservice_by_ivoid(DATALINK_IVOID)
except DALServiceError:
self._datalink = None
remaining_ids = [] # remaining IDs to processed
current_batch = None # retrieved but not returned yet
current_ids = [] # retrieved but not returned
processed_ids = [] # retrived and returned IDs
batch_size = None # size of the batch

for row in self:
if self._datalink:
if not current_ids:
if batch_size is None:
# first call.
self.query = DatalinkQuery.from_resource(
[_ for _ in self],
self._datalink,
session=self._session,
original_row=row)
remaining_ids = self.query['ID']
if not remaining_ids:
# we are done
return
if batch_size:
# subsequent calls are limitted to batch size
self.query['ID'] = remaining_ids[:batch_size]
current_batch = self.query.execute(post=True)
current_ids = list(OrderedDict.fromkeys(
[_ for _ in current_batch.to_table()['ID']]))
if not current_ids:
raise DALServiceError(
'Could not retrieve datalinks for: {}'.format(
', '.join([_ for _ in remaining_ids])))
batch_size = len(current_ids)
id1 = current_ids.pop(0)
processed_ids.append(id1)
remaining_ids.remove(id1)
yield current_batch.clone_byid(
id1,
original_row=row)
elif row.access_format == DATALINK_MIME_TYPE:
yield DatalinkResults.from_result_url(
row.getdataurl(),
original_row=row)
else:
yield None
if self._datalink is None:
yield from self.iter_datalinks_from_product_rows()

else:
yield from self.iter_datalinks_from_dlblock(self._datalink)


class DatalinkRecordMixin:
Expand Down
97 changes: 96 additions & 1 deletion pyvo/dal/tests/test_datalink.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@

import pyvo as vo
from pyvo.dal.adhoc import DatalinkResults
from pyvo.utils import vocabularies
from pyvo.dal.sia2 import SIA2Results
from pyvo.dal.tap import TAPResults
from pyvo.utils import testing, vocabularies

from astropy.utils.data import get_pkg_data_contents, get_pkg_data_filename

Expand Down Expand Up @@ -39,6 +41,17 @@ def callback(request, context):
yield matcher


@pytest.fixture()
def datalink_product(mocker):
def callback(request, context):
return get_pkg_data_contents('data/datalink/datalink.xml')

with mocker.register_uri(
'GET', 'http://example.com/datalink.xml', content=callback
) as matcher:
yield matcher


@pytest.fixture()
def obscore_datalink(mocker):
def callback(request, context):
Expand Down Expand Up @@ -198,3 +211,85 @@ def test_all_mixed(self):
assert res[1].endswith("comb_avg.0001.fits.fz?preview=True")
assert res[2].endswith("http://dc.zah.uni-heidelberg.de/wider.dat")
assert res[3].endswith("when-will-it-be-back")


@pytest.mark.filterwarnings("ignore::astropy.io.votable.exceptions.E02")
@pytest.mark.usefixtures('datalink_product', 'datalink_vocabulary')
class TestIterDatalinksProducts:
"""Tests for producing datalinks from tables containing links to
datalink documents.
"""
def test_no_access_format(self):
res = testing.create_dalresults([
{"name": "access_url", "datatype": "char", "arraysize": "*",
"utype": "obscore:access.reference"}],
[("http://foo.bar/baz.jpeg",)],
resultsClass=TAPResults)
assert list(res.iter_datalinks()) == []

def test_obscore_utype(self):
res = testing.create_dalresults([
{"name": "data_product", "datatype": "char", "arraysize": "*",
"utype": "obscore:access.reference"},
{"name": "content_type", "datatype": "char", "arraysize": "*",
"utype": "obscore:access.format"},],
[("http://example.com/datalink.xml",
"application/x-votable+xml;content=datalink")],
resultsClass=TAPResults)
links = list(res.iter_datalinks())
assert len(links) == 1
assert (next(links[0].bysemantics("#this"))["access_url"]
== "http://dc.zah.uni-heidelberg.de/getproduct/flashheros/data/ca90/f0011.mt")

def test_sia2_record(self):
res = testing.create_dalresults([
{"name": "access_url", "datatype": "char", "arraysize": "*",
"utype": "obscore:access.reference"},
{"name": "access_format", "datatype": "char", "arraysize": "*",
"utype": "obscore:access.format"},],
[("http://example.com/datalink.xml",
"application/x-votable+xml;content=datalink")],
resultsClass=SIA2Results)
links = list(res.iter_datalinks())
assert len(links) == 1
assert (next(links[0].bysemantics("#this"))["access_url"]
== "http://dc.zah.uni-heidelberg.de/getproduct/flashheros/data/ca90/f0011.mt")

def test_sia1_record(self):
res = testing.create_dalresults([
{"name": "access_url", "datatype": "char", "arraysize": "*",
"ucd": "VOX:Image_AccessReference"},
{"name": "access_format", "datatype": "char", "arraysize": "*",
"utype": "VOX:Image_Format"},],
[("http://example.com/datalink.xml",
"application/x-votable+xml;content=datalink")],
resultsClass=TAPResults)
links = list(res.iter_datalinks())
assert len(links) == 1
assert (next(links[0].bysemantics("#this"))["access_url"]
== "http://dc.zah.uni-heidelberg.de/getproduct/flashheros/data/ca90/f0011.mt")

def test_generic_record(self):
# The meta.code.mime and meta.ref.url UCDs are perhaps too
# generic. To ensure a somewhat predictable behaviour,
# we at least make sure we pick the first of possibly multiple
# pairs (not that this would preclude arbitrary amounts of
# chaos).
res = testing.create_dalresults([
{"name": "access_url", "datatype": "char", "arraysize": "*",
"ucd": "meta.ref.url"},
{"name": "access_format", "datatype": "char", "arraysize": "*",
"utype": "meta.code.mime"},
{"name": "alt_access_url", "datatype": "char", "arraysize": "*",
"ucd": "meta.ref.url"},
{"name": "alt_access_format", "datatype": "char", "arraysize": "*",
"utype": "meta.code.mime"},],
[("http://example.com/datalink.xml",
"application/x-votable+xml;content=datalink",
"http://example.com/bad-pick.xml",
"application/x-votable+xml;content=datalink",)],
resultsClass=TAPResults)
links = list(res.iter_datalinks())
assert len(links) == 1
assert (next(links[0].bysemantics("#this"))["access_url"]
== "http://dc.zah.uni-heidelberg.de/getproduct/flashheros/data/ca90/f0011.mt")
8 changes: 7 additions & 1 deletion pyvo/utils/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,20 @@
from pyvo.dal import query as dalquery


try:
TABLE_ELEMENT = tree.TableElement
except AttributeError:
TABLE_ELEMENT = tree.Table


def create_votable(field_descs, records):
"""returns a VOTableFile with a a single table containing records,
described by field_descs.
"""
votable = tree.VOTableFile()
resource = tree.Resource(type="results")
votable.resources.append(resource)
table = tree.Table(votable)
table = TABLE_ELEMENT(votable)
resource.tables.append(table)
table.fields.extend(
tree.Field(votable, **desc) for desc in field_descs)
Expand Down

0 comments on commit c206cbe

Please sign in to comment.