././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1740504381.8817263 extra_data-1.20.0/0000755000175100001660000000000014757376476013343 5ustar00runnerdocker././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/.coveragerc0000644000175100001660000000016714757376472015464 0ustar00runnerdocker[run] omit = */tests/* concurrency = multiprocessing [paths] source = extra_data/ */site-packages/extra_data/ ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1740504381.8797262 extra_data-1.20.0/EXtra_data.egg-info/0000755000175100001660000000000014757376476017051 5ustar00runnerdocker././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504381.0 extra_data-1.20.0/EXtra_data.egg-info/PKG-INFO0000644000175100001660000000664714757376475020162 0ustar00runnerdockerMetadata-Version: 2.2 Name: EXtra-data Version: 1.20.0 Summary: Tools to read and analyse data from European XFEL Author: European XFEL GmbH Author-email: da-support@xfel.eu Maintainer: Thomas Michelat License: BSD-3-Clause Project-URL: Documentation, https://extra-data.readthedocs.io/en/latest/ Project-URL: Release notes, https://extra-data.readthedocs.io/en/latest/changelog.html Project-URL: Issues, https://github.com/European-XFEL/EXtra-data/issues Project-URL: Source, https://github.com/European-XFEL/EXtra-data Classifier: Development Status :: 5 - Production/Stable Classifier: Environment :: Console Classifier: Intended Audience :: Developers Classifier: Intended Audience :: Science/Research Classifier: License :: OSI Approved :: BSD License Classifier: Operating System :: POSIX :: Linux Classifier: Programming Language :: Python :: 3 Classifier: Topic :: Scientific/Engineering :: Information Analysis Classifier: Topic :: Scientific/Engineering :: Physics Requires-Python: >=3.10 Description-Content-Type: text/markdown License-File: LICENSE Requires-Dist: h5py>=2.10 Requires-Dist: matplotlib Requires-Dist: numpy Requires-Dist: packaging Requires-Dist: pandas Requires-Dist: xarray Requires-Dist: pyyaml Provides-Extra: bridge Requires-Dist: karabo-bridge>=0.6; extra == "bridge" Requires-Dist: psutil; extra == "bridge" Provides-Extra: complete Requires-Dist: dask[array]; extra == "complete" Requires-Dist: extra_data[bridge]; extra == "complete" Requires-Dist: tomli; python_version < "3.11" and extra == "complete" Provides-Extra: docs Requires-Dist: extra_data[bridge]; extra == "docs" Requires-Dist: ipython; extra == "docs" Requires-Dist: nbsphinx; extra == "docs" Requires-Dist: sphinx; extra == "docs" Requires-Dist: sphinxcontrib_github_alt; extra == "docs" Provides-Extra: test Requires-Dist: cloudpickle; extra == "test" Requires-Dist: coverage; extra == "test" Requires-Dist: extra_data[complete]; extra == "test" Requires-Dist: nbval; extra == "test" Requires-Dist: pytest; extra == "test" Requires-Dist: pytest-cov; extra == "test" Requires-Dist: testpath; extra == "test" Dynamic: author Dynamic: author-email Dynamic: classifier Dynamic: description Dynamic: description-content-type Dynamic: license Dynamic: maintainer Dynamic: project-url Dynamic: provides-extra Dynamic: requires-dist Dynamic: requires-python Dynamic: summary [![Build Status](https://github.com/European-XFEL/EXtra-data/workflows/Tests/badge.svg)](https://github.com/European-XFEL/EXtra-data/actions?query=workflow%3ATests) [![codecov](https://codecov.io/gh/European-XFEL/EXtra-data/branch/master/graph/badge.svg)](https://codecov.io/gh/European-XFEL/EXtra-data) Python 3 tools for reading European XFEL's HDF5 files. [EXtra-data documentation](https://extra-data.readthedocs.io/en/latest/) See also: [European XFEL data analysis documentation](https://rtd.xfel.eu/docs/data-analysis-user-documentation/en/latest/index.html) Installing ========== *EXtra-data* is available on our Anaconda installation on the Maxwell cluster: module load exfel exfel_anaconda3 You can also install it [from PyPI](https://pypi.org/project/extra-data/) to use in other environments with Python 3.6 or later: pip install extra_data If you get a permissions error, add the `--user` flag to that command. Contributing =========== Tests ----- Tests can be run as follows: python3 -m pytest -v --pyargs extra_data In the source directory, you can also omit `--pyargs extra_data`. ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504381.0 extra_data-1.20.0/EXtra_data.egg-info/SOURCES.txt0000644000175100001660000000460114757376475020735 0ustar00runnerdocker.coveragerc LICENSE MANIFEST.in README.md pytest.ini setup.py EXtra_data.egg-info/PKG-INFO EXtra_data.egg-info/SOURCES.txt EXtra_data.egg-info/dependency_links.txt EXtra_data.egg-info/entry_points.txt EXtra_data.egg-info/requires.txt EXtra_data.egg-info/top_level.txt extra_data/__init__.py extra_data/aliases.py extra_data/components.py extra_data/copy.py extra_data/exceptions.py extra_data/export.py extra_data/file_access.py extra_data/keydata.py extra_data/locality.py extra_data/lsxfel.py extra_data/read_machinery.py extra_data/reader.py extra_data/run_files_map.py extra_data/sourcedata.py extra_data/stacking.py extra_data/utils.py extra_data/validation.py extra_data/voview.py extra_data/write_cxi.py extra_data/writer.py extra_data/cli/__init__.py extra_data/cli/make_virtual_cxi.py extra_data/cli/serve_files.py extra_data/cli/serve_run.py extra_data/tests/__init__.py extra_data/tests/conftest.py extra_data/tests/make_examples.py extra_data/tests/test_aliases.py extra_data/tests/test_bad_trains.py extra_data/tests/test_components.py extra_data/tests/test_copy.py extra_data/tests/test_file_access.py extra_data/tests/test_keydata.py extra_data/tests/test_lsxfel.py extra_data/tests/test_open_file_limiter.py extra_data/tests/test_open_run.py extra_data/tests/test_read_machinery.py extra_data/tests/test_reader_mockdata.py extra_data/tests/test_run_files_map.py extra_data/tests/test_slice_objs.py extra_data/tests/test_sourcedata.py extra_data/tests/test_stacking.py extra_data/tests/test_streamer.py extra_data/tests/test_validation.py extra_data/tests/test_voview.py extra_data/tests/test_writer.py extra_data/tests/cli/__init__.py extra_data/tests/cli/test_make_virtual_cxi.py extra_data/tests/mockdata/__init__.py extra_data/tests/mockdata/adc.py extra_data/tests/mockdata/agipd.py extra_data/tests/mockdata/base.py extra_data/tests/mockdata/basler_camera.py extra_data/tests/mockdata/control_common.py extra_data/tests/mockdata/dctrl.py extra_data/tests/mockdata/detectors.py extra_data/tests/mockdata/gauge.py extra_data/tests/mockdata/gec_camera.py extra_data/tests/mockdata/imgfel.py extra_data/tests/mockdata/jungfrau.py extra_data/tests/mockdata/mkfile.py extra_data/tests/mockdata/motor.py extra_data/tests/mockdata/mpod.py extra_data/tests/mockdata/proc.py extra_data/tests/mockdata/sidemic_camera.py extra_data/tests/mockdata/tsens.py extra_data/tests/mockdata/uvlamp.py extra_data/tests/mockdata/xgm.py././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504381.0 extra_data-1.20.0/EXtra_data.egg-info/dependency_links.txt0000644000175100001660000000000114757376475023116 0ustar00runnerdocker ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504381.0 extra_data-1.20.0/EXtra_data.egg-info/entry_points.txt0000644000175100001660000000051114757376475022343 0ustar00runnerdocker[console_scripts] extra-data-locality = extra_data.locality:main extra-data-make-virtual-cxi = extra_data.cli.make_virtual_cxi:main extra-data-validate = extra_data.validation:main karabo-bridge-serve-files = extra_data.cli.serve_files:main karabo-bridge-serve-run = extra_data.cli.serve_run:main lsxfel = extra_data.lsxfel:main ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504381.0 extra_data-1.20.0/EXtra_data.egg-info/requires.txt0000644000175100001660000000052314757376475021450 0ustar00runnerdockerh5py>=2.10 matplotlib numpy packaging pandas xarray pyyaml [bridge] karabo-bridge>=0.6 psutil [complete] dask[array] extra_data[bridge] [complete:python_version < "3.11"] tomli [docs] extra_data[bridge] ipython nbsphinx sphinx sphinxcontrib_github_alt [test] cloudpickle coverage extra_data[complete] nbval pytest pytest-cov testpath ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504381.0 extra_data-1.20.0/EXtra_data.egg-info/top_level.txt0000644000175100001660000000001314757376475021574 0ustar00runnerdockerextra_data ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/LICENSE0000644000175100001660000000301414757376472014342 0ustar00runnerdockerBSD 3-Clause License Copyright (c) 2017, European X-Ray Free-Electron Laser Facility GmbH All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/MANIFEST.in0000644000175100001660000000011114757376472015066 0ustar00runnerdockerinclude LICENSE include README.md include .coveragerc include pytest.ini ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1740504381.8817263 extra_data-1.20.0/PKG-INFO0000644000175100001660000000664714757376476014455 0ustar00runnerdockerMetadata-Version: 2.2 Name: EXtra-data Version: 1.20.0 Summary: Tools to read and analyse data from European XFEL Author: European XFEL GmbH Author-email: da-support@xfel.eu Maintainer: Thomas Michelat License: BSD-3-Clause Project-URL: Documentation, https://extra-data.readthedocs.io/en/latest/ Project-URL: Release notes, https://extra-data.readthedocs.io/en/latest/changelog.html Project-URL: Issues, https://github.com/European-XFEL/EXtra-data/issues Project-URL: Source, https://github.com/European-XFEL/EXtra-data Classifier: Development Status :: 5 - Production/Stable Classifier: Environment :: Console Classifier: Intended Audience :: Developers Classifier: Intended Audience :: Science/Research Classifier: License :: OSI Approved :: BSD License Classifier: Operating System :: POSIX :: Linux Classifier: Programming Language :: Python :: 3 Classifier: Topic :: Scientific/Engineering :: Information Analysis Classifier: Topic :: Scientific/Engineering :: Physics Requires-Python: >=3.10 Description-Content-Type: text/markdown License-File: LICENSE Requires-Dist: h5py>=2.10 Requires-Dist: matplotlib Requires-Dist: numpy Requires-Dist: packaging Requires-Dist: pandas Requires-Dist: xarray Requires-Dist: pyyaml Provides-Extra: bridge Requires-Dist: karabo-bridge>=0.6; extra == "bridge" Requires-Dist: psutil; extra == "bridge" Provides-Extra: complete Requires-Dist: dask[array]; extra == "complete" Requires-Dist: extra_data[bridge]; extra == "complete" Requires-Dist: tomli; python_version < "3.11" and extra == "complete" Provides-Extra: docs Requires-Dist: extra_data[bridge]; extra == "docs" Requires-Dist: ipython; extra == "docs" Requires-Dist: nbsphinx; extra == "docs" Requires-Dist: sphinx; extra == "docs" Requires-Dist: sphinxcontrib_github_alt; extra == "docs" Provides-Extra: test Requires-Dist: cloudpickle; extra == "test" Requires-Dist: coverage; extra == "test" Requires-Dist: extra_data[complete]; extra == "test" Requires-Dist: nbval; extra == "test" Requires-Dist: pytest; extra == "test" Requires-Dist: pytest-cov; extra == "test" Requires-Dist: testpath; extra == "test" Dynamic: author Dynamic: author-email Dynamic: classifier Dynamic: description Dynamic: description-content-type Dynamic: license Dynamic: maintainer Dynamic: project-url Dynamic: provides-extra Dynamic: requires-dist Dynamic: requires-python Dynamic: summary [![Build Status](https://github.com/European-XFEL/EXtra-data/workflows/Tests/badge.svg)](https://github.com/European-XFEL/EXtra-data/actions?query=workflow%3ATests) [![codecov](https://codecov.io/gh/European-XFEL/EXtra-data/branch/master/graph/badge.svg)](https://codecov.io/gh/European-XFEL/EXtra-data) Python 3 tools for reading European XFEL's HDF5 files. [EXtra-data documentation](https://extra-data.readthedocs.io/en/latest/) See also: [European XFEL data analysis documentation](https://rtd.xfel.eu/docs/data-analysis-user-documentation/en/latest/index.html) Installing ========== *EXtra-data* is available on our Anaconda installation on the Maxwell cluster: module load exfel exfel_anaconda3 You can also install it [from PyPI](https://pypi.org/project/extra-data/) to use in other environments with Python 3.6 or later: pip install extra_data If you get a permissions error, add the `--user` flag to that command. Contributing =========== Tests ----- Tests can be run as follows: python3 -m pytest -v --pyargs extra_data In the source directory, you can also omit `--pyargs extra_data`. ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/README.md0000644000175100001660000000214414757376472014617 0ustar00runnerdocker[![Build Status](https://github.com/European-XFEL/EXtra-data/workflows/Tests/badge.svg)](https://github.com/European-XFEL/EXtra-data/actions?query=workflow%3ATests) [![codecov](https://codecov.io/gh/European-XFEL/EXtra-data/branch/master/graph/badge.svg)](https://codecov.io/gh/European-XFEL/EXtra-data) Python 3 tools for reading European XFEL's HDF5 files. [EXtra-data documentation](https://extra-data.readthedocs.io/en/latest/) See also: [European XFEL data analysis documentation](https://rtd.xfel.eu/docs/data-analysis-user-documentation/en/latest/index.html) Installing ========== *EXtra-data* is available on our Anaconda installation on the Maxwell cluster: module load exfel exfel_anaconda3 You can also install it [from PyPI](https://pypi.org/project/extra-data/) to use in other environments with Python 3.6 or later: pip install extra_data If you get a permissions error, add the `--user` flag to that command. Contributing =========== Tests ----- Tests can be run as follows: python3 -m pytest -v --pyargs extra_data In the source directory, you can also omit `--pyargs extra_data`. ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1740504381.8707263 extra_data-1.20.0/extra_data/0000755000175100001660000000000014757376476015457 5ustar00runnerdocker././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/__init__.py0000644000175100001660000000403514757376472017566 0ustar00runnerdocker# coding: utf-8 """The extra_data package. Copyright (c) 2017, European X-Ray Free-Electron Laser Facility GmbH All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You should have received a copy of the 3-Clause BSD License along with this program. If not, see """ __version__ = "1.20.0" from .exceptions import ( SourceNameError, PropertyNameError, TrainIDError, AliasError, MultiRunError ) from .keydata import KeyData from .reader import * from .sourcedata import SourceData from .stacking import * from .aliases import AliasIndexer __all__ = reader.__all__ + stacking.__all__ + [ 'SourceData', 'KeyData' ] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/aliases.py0000644000175100001660000002115714757376472017454 0ustar00runnerdockerfrom collections import defaultdict from collections.abc import Iterable from .exceptions import AliasError class AliasIndexer: __slots__ = ('data',) def __init__(self, data): self.data = data def _resolve_any_alias(self, alias): alias = alias.lower().replace('_', '-') try: literal = self.data._aliases[alias] except KeyError: raise AliasError(alias) from None return literal def _resolve_source_alias(self, alias): source = self._resolve_any_alias(alias) if isinstance(source, tuple): raise ValueError(f'{alias} not aliasing a source for this data') return source def __getitem__(self, aliased_item): if isinstance(aliased_item, tuple) and len(aliased_item) == 2: # Source alias with key literal. return self.data[self._resolve_source_alias(aliased_item[0]), aliased_item[1]] elif isinstance(aliased_item, str): # Source or key alias. return self.data[self._resolve_any_alias(aliased_item)] raise TypeError('expected alias or (source alias, key) tuple') def _ipython_key_completions_(self): return list(self.data._aliases.keys()) def __contains__(self, aliased_item): try: self[aliased_item] return True except KeyError: return False def __repr__(self): """ Pretty-print all the aliases. """ RED = "\033[91m" END_COLOR = "\033[0m" # Get the right icon for an alias def alias_icon(exists): if isinstance(exists, str): exists = exists in self return " " if exists else f"{RED}✗{END_COLOR}" # Find the alias for a source, if one exists def source_alias(source): for alias, alias_ident in self.data._aliases.items(): if isinstance(alias_ident, str) and source == alias_ident: return alias return None # Group all the aliases by source. The keys of this # dictionary can be either just the source name, or a # tuple of (alias, source). The values are a list of # tuples of (alias, key). source_key_aliases = defaultdict(list) for alias in self.data._aliases.keys(): alias_ident = self.data._aliases[alias] if isinstance(alias_ident, tuple): source = alias_ident[0] if source_alias(source) is not None: dict_key = (source_alias(source), source) else: dict_key = source source_key_aliases[dict_key].append((alias, alias_ident[1])) elif isinstance(alias_ident, str): source_key_aliases[(alias, alias_ident)].extend([]) if len(source_key_aliases) == 0: return "No aliases have been loaded." # Print the aliases output_lines = ["Loaded aliases:"] for source, alias_keys in source_key_aliases.items(): if len(alias_keys) == 0: # If there are no keys then this is a plain source alias alias, source = source output_lines.append(f"{alias_icon(alias)} {alias}: {source}") else: # Check if all the key aliases for the source are valid, # and use that to select an icon for the source keys_exists = [alias in self for alias, _ in alias_keys] if all(keys_exists): source_icon = alias_icon(True) elif not any(keys_exists): source_icon = alias_icon(False) else: source_icon = "~" # Extract the source alias, if it exists if isinstance(source, tuple): source_alias = source[0] source = source[1] else: source_alias = None # If a source has a single key alias, print it on one # line. Otherwise we print the keys indented under the source. if len(alias_keys) == 1: alias, key = alias_keys[0] output_lines.append(f"{alias_icon(alias)} {alias}: ({source}, {key})") else: # If there's an alias, include it in the source header if source_alias is None: source_str = f"{source}" else: source_str = f"{source_alias} ({source})" output_lines.append(f"{source_icon} {source_str}:") for alias, key in alias_keys: output_lines.append(f" {alias_icon(alias)} {alias}: {key}") # Add a newline to the last line added. We can't add a newline by # itself because otherwise it would double up with other newlines # when being joined together at the end output_lines[-1] = output_lines[-1] + "\n" return "\n".join(output_lines) def __str__(self): return f"" def _resolve_aliased_selection(self, selection): if isinstance(selection, dict): res = {self._resolve_source_alias(alias): keys for alias, keys in selection.items()} elif isinstance(selection, Iterable): res = [] for item in selection: if isinstance(item, tuple) and len(item) == 2: # Source alias and literal key. item = (self._resolve_source_alias(item[0]), item[1]) elif isinstance(item, str): item = self._resolve_any_alias(item) if isinstance(item, str): # Source alias. item = (item, '*') res.append(item) return res def select(self, seln_or_alias, key_glob='*', require_all=False, require_any=False): """Select a subset of sources and keys from this data using aliases. This method is only accessible through the :attr:`DataCollection.alias` property. In contrast to :meth:`DataCollection.select`, only a subset of ways to select data via aliases is supported: 1. With a source alias and literal key glob pattern:: # Select all pulse energy keys for an aliased XGM fast data. sel = run.alias.select('sa1-xgm', 'data.intensity*') 2. With an iterable of aliases and/or (source alias, key pattern) tuples:: # Select specific keys for an aliased XGM fast data. sel = run.alias.select([('sa1-xgm', 'data.intensitySa1TD'), ('sa1-xgm', 'data.intensitySa3TD')] # Select several aliases, may be both source and key aliases. sel = run.alias.select(['sa1-xgm', 'mono-hv']) Data is included if it matches any of the aliases. Note that this method does not support glob patterns for the source alias. 3. With a dict of source aliases mapped to sets of key names (or empty sets to get all keys):: # Select image.data from an aliased AGIPD and all data # from an aliased XGM. sel = run.select({'agipd': {'image.data'}, 'sa1-xgm': set()}) The optional `require_all` and `require_any` arguments restrict the trains to those for which all or at least one selected sources and keys have at least one data entry. By default, all trains remain selected. Returns a new :class:`DataCollection` object for the selected data. """ if isinstance(seln_or_alias, str): seln_or_alias = [(seln_or_alias, key_glob)] return self.data.select(self._resolve_aliased_selection( seln_or_alias), require_all=require_all, require_any=require_any) def deselect(self, seln_or_alias, key_glob='*'): """Select everything except the specified sources and keys using aliases. This method is only accessible through the :attr:`DataCollection.alias` property. This takes the same arguments as :meth:`select`, but the sources and keys you specify are dropped from the selection. Returns a new :class:`DataCollection` object for the remaining data. """ if isinstance(seln_or_alias, str): seln_or_alias = [(seln_or_alias, key_glob)] return self.data.deselect(self._resolve_aliased_selection( seln_or_alias)) ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1740504381.8717263 extra_data-1.20.0/extra_data/cli/0000755000175100001660000000000014757376476016226 5ustar00runnerdocker././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/cli/__init__.py0000644000175100001660000000005114757376472020327 0ustar00runnerdocker"""extra_data command-line interfaces""" ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/cli/make_virtual_cxi.py0000644000175100001660000001032014757376472022116 0ustar00runnerdockerimport argparse import logging import os import os.path as osp import re import sys from textwrap import dedent from extra_data import RunDirectory from extra_data.components import identify_multimod_detectors log = logging.getLogger(__name__) def parse_number(number:str): try: return float(number) except ValueError: return int(number, 0) def main(argv=None): example = dedent(""" Example: extra-data-make-virtual-cxi -o ./out_file.h5 --min-modules 15 \\ --fill-value data 0 --fill-value gain 1 /path/to/source/run """) ap = argparse.ArgumentParser( 'extra-data-make-virtual-cxi', epilog=example, formatter_class=argparse.RawDescriptionHelpFormatter, description='Write a virtual CXI file to access the detector data.' ) ap.add_argument('run_dir', help="Path to an EuXFEL run directory") # Specifying a proposal directory & a run number is the older interface. # If the run_number argument is passed, run_dir is used as proposal. ap.add_argument('run_number', nargs="?", help=argparse.SUPPRESS) ap.add_argument( '-o', '--output', help="Filename or path for the CXI output file. " "By default, it is written in the proposal's scratch directory." ) ap.add_argument( '--min-modules', type=int, default=None, metavar='N', help='Include trains where at least N modules have data (default:' ' half+1 of all detector modules).' ) ap.add_argument( '--n-modules', type=int, default=None, metavar='N', help='Number of detector modules in the experiment setup.' ' Should be used only for JUNGFRAU data.' ) ap.add_argument( '--fill-value', action='append', nargs=2, metavar=('DS', 'V'), help='define fill value (V) for individual dataset (DS). Datasets are' ' "data", "gain" and "mask". (defaults: data: nan (proc, float32)' ' or 0 (raw, uint16); gain: 0; mask: 0xffffffff)' ) ap.add_argument( '--exc-suspect-trains', action='store_true', help='Exclude suspect trains. This tries to avoid some issues with' ' incorrect train IDs in the data, but may mean less data is' ' available.' ) args = ap.parse_args(argv) out_file = args.output fill_values = None if args.fill_value: fill_values = {ds: parse_number(value) for ds, value in args.fill_value} logging.basicConfig(level=logging.INFO) if args.run_number: # proposal directory, run number run = 'r%04d' % int(args.run_number) proposal = args.run_dir run_dir = osp.join(args.run_dir, 'proc', run) if out_file is None: out_file = osp.join(proposal, 'scratch', '{}_detectors_virt.cxi'.format(run)) else: # run directory run_dir = os.path.abspath(args.run_dir) if out_file is None: m = re.search(r'/(raw|proc)/(r\d{4})/?$', run_dir) if not m: sys.exit("ERROR: '-o outfile' option needed when " "input directory doesn't look like .../proc/r0123") proposal = run_dir[:m.start()] fname = '{}_{}_detectors_virt.cxi'.format(*m.group(2, 1)) out_file = osp.join(proposal, 'scratch', fname) out_dir = osp.dirname(osp.abspath(out_file)) if not os.access(run_dir, os.R_OK): sys.exit("ERROR: Don't have read access to {}".format(run_dir)) if not os.access(out_dir, os.W_OK): sys.exit("ERROR: Don't have write access to {}".format(out_dir)) log.info("Reading run directory %s", run_dir) inc_suspect = not args.exc_suspect_trains run = RunDirectory(run_dir, inc_suspect_trains=inc_suspect) _, det_class = identify_multimod_detectors(run, single=True) n_modules = det_class.n_modules kwargs = {} if n_modules == 0: n_modules = args.n_modules kwargs['n_modules'] = n_modules min_modules = args.min_modules if min_modules is None: min_modules = 1 if (n_modules is None) else (n_modules // 2) + 1 det = det_class(run, min_modules=min_modules, **kwargs) det.write_virtual_cxi(out_file, fill_values) if __name__ == '__main__': main() ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/cli/serve_files.py0000644000175100001660000000360314757376472021104 0ustar00runnerdockerfrom argparse import ArgumentParser import sys IMPORT_FAILED_MSG = """\ {} karabo-bridge-serve-files requires additional dependencies: pip install karabo-bridge psutil """ def main(argv=None): ap = ArgumentParser(prog="karabo-bridge-serve-files") ap.add_argument("path", help="Path of a file or run directory to serve") ap.add_argument("port", help="TCP port or ZMQ endpoint to send data on") ap.add_argument( "--source", help="Stream only matching sources ('*' is a wildcard)", default='*', ) ap.add_argument( "--key", help="Stream only matching keys ('*' is a wildcard)", default='*', ) ap.add_argument( "--append-detector-modules", help="combine multiple module sources" " into one (will only work for AGIPD data currently).", action='store_true' ) ap.add_argument( "--dummy-timestamps", help="create dummy timestamps if the meta-data" " lacks proper timestamps", action='store_true' ) ap.add_argument( "--use-infiniband", help="Use infiniband interface if available " "(if a TCP port is specified)", action='store_true' ) ap.add_argument( "-z", "--socket-type", help="ZeroMQ socket type", choices=['PUB', 'PUSH', 'REP'], default='REP' ) args = ap.parse_args(argv) try: from ..export import serve_files except ImportError as e: sys.exit(IMPORT_FAILED_MSG.format(e)) try: serve_files( args.path, args.port, source_glob=args.source, key_glob=args.key, append_detector_modules=args.append_detector_modules, dummy_timestamps=args.dummy_timestamps, use_infiniband=args.use_infiniband, sock=args.socket_type ) except KeyboardInterrupt: pass print('\nStopped.') if __name__ == '__main__': main() ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/cli/serve_run.py0000644000175100001660000000541314757376472020607 0ustar00runnerdockerfrom argparse import ArgumentParser import sys from .. import open_run IMPORT_FAILED_MSG = """\ {} karabo-bridge-serve-run requires additional dependencies: pip install karabo-bridge psutil """ def main(argv=None): ap = ArgumentParser(prog="karabo-bridge-serve-run") ap.add_argument("proposal", help="Proposal number") ap.add_argument("run", help="Run number") ap.add_argument( "--port", default="0", help="TCP port or ZMQ endpoint to send data on. " "Selects a random TCP port by default.") ap.add_argument( "--include", help="Select matching sources (and optionally keys) to " "include in streamed data", action='append' ) ap.add_argument( "--allow-partial", help="Send trains where some sources are missing", action='store_true' ) ap.add_argument( "--append-detector-modules", help="combine multiple module sources" " into one (will only work for AGIPD data currently).", action='store_true' ) ap.add_argument( "--dummy-timestamps", help="create dummy timestamps if the meta-data" " lacks proper timestamps", action='store_true' ) ap.add_argument( "--use-infiniband", help="Use infiniband interface if available " "(if a TCP port is specified)", action='store_true' ) ap.add_argument( "-z", "--socket-type", help="ZeroMQ socket type", choices=['PUB', 'PUSH', 'REP'], default='REP' ) args = ap.parse_args(argv) try: from ..export import serve_data except ImportError as e: sys.exit(IMPORT_FAILED_MSG.format(e)) run = open_run(args.proposal, args.run, data='all') if not args.include: print("Available sources:") for s in sorted(run.all_sources): print(f" {s}") sys.exit("Please select at least one source with --include") include = [] for pat in args.include: if '[' in pat: if not pat.endswith(']'): sys.exit(f"Missing final ] in {pat!r}") src_pat, key_pat = pat[:-1].split('[', 1) include.append((src_pat, key_pat)) else: # Source pattern only include.append(pat) if args.allow_partial: sel = run.select(include, require_any=True) else: sel = run.select(include, require_all=True) try: serve_data( sel, args.port, append_detector_modules=args.append_detector_modules, dummy_timestamps=args.dummy_timestamps, use_infiniband=args.use_infiniband, sock=args.socket_type ) except KeyboardInterrupt: print('\nStopped.') if __name__ == '__main__': main() ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/components.py0000644000175100001660000023157714757376472020231 0ustar00runnerdocker"""Interfaces to data from specific instruments """ import logging import math import re from collections.abc import Iterable from copy import copy from warnings import warn import numpy as np import pandas as pd from .exceptions import SourceNameError from .reader import DataCollection, by_id, by_index from .read_machinery import DataChunk, roi_shape, split_trains from .writer import FileWriter from .write_cxi import XtdfCXIWriter, JUNGFRAUCXIWriter __all__ = [ 'AGIPD1M', 'AGIPD500K', 'DSSC1M', 'LPD1M', 'JUNGFRAU', 'identify_multimod_detectors', ] log = logging.getLogger(__name__) MAX_PULSES = 2700 NO_PULSE_ID = 9999 def multimod_detectors(detector_cls): """ Decorator for multimod detector classes (e.g. AGIPD/LPD/JUNGFRAU) to store them in a list 'multimod_detectors.list' and their names in 'multimod_detectors.names'. Parameters ---------- detector_cls: class Decorated detector class to append to the list. Returns ------- detector_cls: class Unmodified decorated detector class. """ multimod_detectors.list = getattr(multimod_detectors, 'list', list()) multimod_detectors.list.append(detector_cls) multimod_detectors.names = getattr(multimod_detectors, 'names', list()) multimod_detectors.names.append(detector_cls.__name__) return detector_cls def _check_pulse_selection(pulses): """Check and normalise a pulse selection""" if not isinstance(pulses, (by_id, by_index)): pulses = by_index[pulses] val = pulses.value if isinstance(pulses.value, slice): # Ensure start/stop/step are all real numbers start = val.start if (val.start is not None) else 0 stop = val.stop if (val.stop is not None) else MAX_PULSES step = val.step if (val.step is not None) else 1 if not all(isinstance(s, int) for s in (start, stop, step)): raise TypeError("Pulse selection slice must use integers or None") if step < 1: raise ValueError("Pulse selection slice must have positive step") if (start < 0) or (stop < 0): raise NotImplementedError("Negative pulse indices not supported") return type(pulses)(slice(start, stop, step)) # Convert everything except slices to numpy arrays elif isinstance(pulses.value, int): val = np.array([val], dtype=np.uint64) else: val = np.asarray(val, dtype=np.uint64) if (val < 0).any(): if isinstance(pulses, by_id): raise ValueError("Pulse IDs cannot be negative") else: raise NotImplementedError("Negative pulse indices not supported") return type(pulses)(val) def _select_pulse_ids(pulses, data_pulse_ids): """Select pulses by ID across a chunk of trains Returns a boolean array of which entries in data_pulse_ids match. """ if isinstance(pulses.value, slice): s = pulses.value desired = np.arange(s.start, s.stop, step=s.step, dtype=np.uint64) else: desired = pulses.value return np.isin(data_pulse_ids, desired) def _out_array(shape, dtype, fill_value=None): if fill_value is None: fill_value = np.nan if dtype.kind == 'f' else 0 fill_value = dtype.type(fill_value) # Zeroed memory can be allocated faster than explicitly writing zeros if fill_value == 0: return np.zeros(shape, dtype=dtype) else: return np.full(shape, fill_value, dtype=dtype) class MultimodDetectorBase: """Base class for detectors made of several modules as separate data sources """ _det_name_pat = r'([^/]+)' _source_raw_pat = r'/DET/(?P\d+)CH' _source_corr_pat = r'/CORR/(?P\d+)CH' # Override in subclass _main_data_key = '' # Key to use for checking data counts match _mask_data_key = '' _frames_per_entry = 1 # Override if separate pulse dimension in files _modnos_start_at = 0 # Override if module numbers start at 1 (JUNGFRAU) module_shape = (0, 0) n_modules = 0 def __init__(self, data: DataCollection, detector_name=None, modules=None, *, min_modules=1, raw=None): if detector_name is None: detector_name = self._find_detector_name(data) if min_modules <= 0: raise ValueError("min_modules must be a positive integer, not " f"{min_modules!r}") source_to_modno = self._identify_sources(data, detector_name, modules, raw=raw) data = data.select([(src, '*') for src in source_to_modno]) self.detector_name = detector_name self.source_to_modno = source_to_modno # pandas' missing-data handling converts the data to floats if there # are any gaps - so fill them with 0s and convert back to uint64. mod_data_counts = pd.DataFrame({ src: data.get_data_counts(src, self._main_data_key) for src in source_to_modno }).fillna(0).astype(np.uint64) # Within any train, all modules should have same count or zero frame_counts = pd.Series(0, index=mod_data_counts.index, dtype=np.uint64) for tid, data_counts in mod_data_counts.iterrows(): count_vals = set(data_counts) - {0} if len(count_vals) > 1: raise ValueError( f"Inconsistent frame counts for train {tid}: {count_vals}" ) elif count_vals: frame_counts[tid] = count_vals.pop() self.data = self._select_trains(data, mod_data_counts, min_modules) # This should be a reversible 1-to-1 mapping self.modno_to_source = {m: s for (s, m) in source_to_modno.items()} assert len(self.modno_to_source) == len(self.source_to_modno) self.frame_counts = frame_counts[self.data.train_ids] self.train_ids_perframe = np.repeat( self.frame_counts.index.values, self.frame_counts.values.astype(np.intp) ) # If we add extra instance attributes, check whether they should be # updated in .select_trains() below. def __getitem__(self, item): return MultimodKeyData(self, item) def __contains__(self, item): return all(item in self.data[s] for s in self.source_to_modno) def masked_data(self, key=None, *, mask_bits=None, masked_value=np.nan): """Combine corrected data with the mask in the files This provides an interface similar to ``det['data.adc']``, but masking out pixels with the mask from the correction pipeline. Parameters ---------- key: str The data key to look at, by default the main data key of the detector (e.g. 'data.adc'). mask_bits: int or list of ints Reasons to exclude pixels, as a bitmask or a list of integers. By default, all types of bad pixel are masked out. See the possible values at: https://extra.readthedocs.io/en/latest/calibration/#extra.calibration.BadPixels masked_value: int, float The replacement value to use for masked data. By default this is NaN. """ key = key or self._main_data_key if self._mask_data_key not in self: raise RuntimeError( f"This data doesn't include a mask ({self._mask_data_key}). " f"You might be using raw instead of corrected data." ) if isinstance(mask_bits, Iterable): mask_bits = self._combine_bitfield(mask_bits) return DetectorMaskedKeyData( self, key, mask_key=self._mask_data_key, mask_bits=mask_bits, masked_value=masked_value ) @staticmethod def _combine_bitfield(ints): res = 0 for i in ints: res |= i return res @classmethod def _find_detector_names(cls, data): # Find sources matching the pattern (raw or proc) for this detector type raw_re = re.compile(f'(?P{cls._det_name_pat}){cls._source_raw_pat}') corr_re = re.compile(f'(?P{cls._det_name_pat}){cls._source_corr_pat}') detector_names = set() for source in data.instrument_sources: if m := raw_re.match(source) or corr_re.match(source): detector_names.add(m['detname']) return detector_names @classmethod def _find_detector_name(cls, data): detector_names = cls._find_detector_names(data) # We want exactly 1 source if not detector_names: raise SourceNameError(f'{cls._det_name_pat}({cls._source_raw_pat}|{cls._source_corr_pat})') elif len(detector_names) > 1: names_s = ', '.join(repr(n) for n in sorted(detector_names)) raise ValueError( f"Multiple detectors found in the data: {names_s}. " f"Pass detector_name to {cls.__name__}() to pick one." ) return detector_names.pop() @staticmethod def _source_matches(data, pat): source_re = re.compile(pat) for source in data.instrument_sources: m = source_re.match(source) if m: yield source, int(m.group('modno')) @classmethod def _data_is_raw(cls, data, source: str): # For most detectors, raw data is uint16 & corrected is float32. # Overridden for AGIPD, where output dtype is configurable. kd = data[source, cls._main_data_key] return np.issubdtype(kd.dtype, np.integer) @classmethod def _identify_sources(cls, data, detector_name, modules=None, raw=None): if raw is True: pat = re.escape(detector_name) + cls._source_raw_pat source_to_modno = dict(cls._source_matches(data, pat)) if not all(cls._data_is_raw(data, s) for s in source_to_modno): # Older corrected data used the same names as raw raise ValueError( f"Raw data was not found: {detector_name}/DET/... sources " f"are from corrected data" ) else: # Prefer corrected data pat = re.escape(detector_name) + cls._source_corr_pat source_to_modno = dict(cls._source_matches(data, pat)) if not source_to_modno: # Data named like raw may also be proc pat = re.escape(detector_name) + cls._source_raw_pat source_to_modno = dict(cls._source_matches(data, pat)) if (raw is False) and any(cls._data_is_raw(data, s) for s in source_to_modno): raise SourceNameError(f'{detector_name}/CORR/...') # raw=None -> legacy behaviour: prefer corrected but allow raw if modules is not None: source_to_modno = {s: n for (s, n) in source_to_modno.items() if n in modules} if not source_to_modno: dc = '(DET|CORR)' if raw is None else 'DET' if raw else 'CORR' raise SourceNameError(f'{detector_name}/{dc}/...') return source_to_modno @classmethod def _select_trains(cls, data, mod_data_counts, min_modules): modules_present = (mod_data_counts > 0).sum(axis=1) mod_data_counts = mod_data_counts[modules_present >= min_modules] ntrains = len(mod_data_counts) if not ntrains: raise ValueError("No data found with >= {} modules present" .format(min_modules)) log.info("Found %d trains with data for at least %d modules", ntrains, min_modules) train_ids = mod_data_counts.index.values return data.select_trains(by_id[train_ids]) @staticmethod def _split_align_chunk(chunk, target_train_ids: np.ndarray, length_limit=np.inf): """ Split up a source chunk to align with parts of a joined array. Chunk points to contiguous source data, but if this misses a train, it might not correspond to a contiguous region in the output. This yields pairs of (target_slice, source_slice) describing chunks that can be copied/mapped to a similar block in the output. Parameters ---------- chunk: read_machinery::DataChunk Reference to a contiguous chunk of data to be mapped. target_train_ids: numpy.ndarray Train ID index for target array to align chunk data to. Train IDs may occur more than once in here. length_limit: int Maximum length of slices (stop - start) to yield. Larger slices will be split up into several pieces. Unlimited by default. """ # Expand the list of train IDs to one per frame chunk_tids = np.repeat(chunk.train_ids, chunk.counts.astype(np.intp)) chunk_match_start = int(chunk.first) while chunk_tids.size > 0: # Look up where the start of this chunk fits in the target tgt_start = (target_train_ids == chunk_tids[0]).nonzero()[0][0] target_tids = target_train_ids[ tgt_start : tgt_start + len(chunk_tids) ] assert target_tids.shape == chunk_tids.shape, \ f"{target_tids.shape} != {chunk_tids.shape}" assert target_tids[0] == chunk_tids[0], \ f"{target_tids[0]} != {chunk_tids[0]}" # How much of this chunk can be mapped in one go? mismatches = (chunk_tids != target_tids).nonzero()[0] if mismatches.size > 0: n_match = mismatches[0] else: n_match = len(chunk_tids) # Split the matched data if needed for length_limit n_batches = max(math.ceil(n_match / length_limit), 1) for i in range(n_batches): start = i * n_match // n_batches stop = (i + 1) * n_match // n_batches yield (slice(tgt_start + start, tgt_start + stop), slice(chunk_match_start + start, chunk_match_start + stop)) # Prepare remaining data in the chunk for the next match chunk_match_start += n_match chunk_tids = chunk_tids[n_match:] @property def train_ids(self): return self.data.train_ids @property def train_id_chunks(self): # Used to be used internally. Kept temporarily in case anyone else used it. warn( "detector.train_id_chunks is likely to be removed in the future. " "Please contact da-support@xfel.eu if you're using it", stacklevel=2 ) train_id_arr = np.asarray(self.data.train_ids) split_indices = np.where(np.diff(train_id_arr) != 1)[0] + 1 return np.split(train_id_arr, split_indices) @property def train_id_to_ix(self): # Used to be used internally. Kept temporarily in case anyone else used it. warn( "detector.train_id_to_ix is likely to be removed in the future. " "Please contact da-support@xfel.eu if you're using it", stacklevel=2 ) # Cumulative sum gives the end of each train, subtract to get start return self.frame_counts.cumsum() - self.frame_counts @property def frames_per_train(self): counts = set(self.frame_counts.unique()) - {0} if len(counts) > 1: raise ValueError(f"Varying number of frames per train: {counts}") return counts.pop() * self._frames_per_entry def __repr__(self): # Show raw/proc det = type(self).__name__ raw = all(self._data_is_raw(self.data, s) for s in self.source_to_modno) rp = 'raw' if raw else 'proc' return (f"<{det}: Data interface for detector {self.detector_name!r} " f"- {rp} data with {len(self.source_to_modno)} modules>") def select_trains(self, trains): """Select a subset of trains from this data as a new object. Slice trains by position within this data:: sel = det.select_trains(np.s_[:5]) Or select trains by train ID, with a slice or a list:: from extra_data import by_id sel1 = det.select_trains(by_id[142844490 : 142844495]) sel2 = det.select_trains(by_id[[142844490, 142844493, 142844494]]) """ # Using a copy to bypass the source & train checks in __init__ res = copy(self) res.data = self.data.select_trains(trains) res.frame_counts = self.frame_counts[res.data.train_ids] res.train_ids_perframe = np.repeat( res.frame_counts.index.values, res.frame_counts.values.astype(np.intp) ) return res def split_trains(self, parts=None, trains_per_part=None, frames_per_part=None): """Split this data into chunks with a fraction of the trains each. At least one of *parts*, *trains_per_part* or *frames_per_part* must be specified. You can pass any combination of these. Parameters ---------- parts: int How many parts to split the data into. If trains_per_part is also specified, this is a minimum, and it may make more parts. It may also make fewer if there are fewer trains in the data. trains_per_part: int A maximum number of trains in each part. Parts will often have fewer trains than this. frames_per_part: int A target number of frames in each part. Each chunk should have up to this many frames, but chunks always contain complete trains, so if this is less than one train, you may get single train chunks with more frames. When ``frames_per_part`` is used, the final chunk may be much smaller than the others. """ if {parts, trains_per_part, frames_per_part} == {None}: raise ValueError( "One of parts, trains_per_part, frames_per_part must be specified" ) if frames_per_part is None: for s in split_trains(len(self.train_ids), parts, trains_per_part): yield self.select_trains(s) else: # frames_per_part was specified. We don't assume that the number # of frames per train is constant, so we'll iterate over trains # and cut off each chunk when we reach the relevant number. if not self.train_ids: return # No data to split if trains_per_part is None: trains_per_part = np.inf if parts: trains_per_part = min(trains_per_part, len(self.train_ids) // parts) chunk_start = 0 ntrains = 1 nentries = self.frame_counts.iloc[0] for frame_ct in self.frame_counts.iloc[1:]: ntrains += 1 nentries += frame_ct if (ntrains > trains_per_part) or (nentries * self._frames_per_entry > frames_per_part): # We've got a full chunk chunk_end = chunk_start + ntrains - 1 yield self.select_trains(np.s_[chunk_start:chunk_end]) chunk_start = chunk_end ntrains = 1 nentries = frame_ct # There will always be at least the last train left to yield yield self.select_trains(np.s_[chunk_start:]) def get_array(self, key, *, fill_value=None, roi=(), astype=None): """Get a labelled array of detector data Parameters ---------- key: str The data to get, e.g. 'image.data' for pixel values. fill_value: int or float, optional Value to use for missing values. If None (default) the fill value is 0 for integers and np.nan for floats. roi: tuple Specify e.g. ``np.s_[10:60, 100:200]`` to select pixels within each module when reading data. The selection is applied to each individual module, so it may only be useful when working with a single module. astype: Type Data type of the output array. If None (default) the dtype matches the input array dtype """ return self[key].xarray(fill_value=fill_value, roi=roi, astype=astype) def get_dask_array(self, key, fill_value=None, astype=None): """Get a labelled Dask array of detector data Parameters ---------- key: str The data to get, e.g. 'image.data' for pixel values. fill_value: int or float, optional Value to use for missing values. If None (default) the fill value is 0 for integers and np.nan for floats. astype: Type Data type of the output array. If None (default) the dtype matches the input array dtype """ return self[key].dask_array(labelled=True, fill_value=fill_value, astype=astype) def trains(self, require_all=True): """Iterate over trains for detector data. Parameters ---------- require_all: bool If True (default), skip trains where any of the selected detector modules are missing data. Yields ------ train_data: dict A dictionary mapping key names (e.g. ``image.data``) to labelled arrays. """ return MPxDetectorTrainIterator(self, require_all=require_all) def data_availability(self, module_gaps=False): """Get an array indicating what image data is available Returns a boolean array (modules, entries), True where a module has data for a given train, False for missing data. """ return self[self._main_data_key].data_availability(module_gaps) class XtdfDetectorBase(MultimodDetectorBase): """Common machinery for a group of detectors with similar data format AGIPD, DSSC & LPD all store pulse-resolved data in an "image" group, with both trains and pulses along the first dimension. This allows a different number of frames to be stored for each train, which makes access more complicated. """ n_modules = 16 _main_data_key = 'image.data' _mask_data_key = 'image.mask' def __getitem__(self, item): if item.startswith('image.'): return XtdfImageMultimodKeyData(self, item) return super().__getitem__(item) def masked_data(self, key=None, *, mask_bits=None, masked_value=np.nan): """Combine corrected data with the mask in the files This provides an interface similar to ``det['image.data']``, but masking out pixels with the mask from the correction pipeline. Parameters ---------- key: str The data key to look at, by default the main data key of the detector (e.g. 'image.data'). mask_bits: int or list of ints Reasons to exclude pixels, as a bitmask or a list of integers. By default, all types of bad pixel are masked out. masked_value: int, float The replacement value to use for masked data. By default this is NaN. """ key = key or self._main_data_key assert key.startswith('image.') if self._mask_data_key not in self: raise RuntimeError( f"This data doesn't include a mask ({self._mask_data_key}). " f"You might be using raw instead of corrected data." ) if isinstance(mask_bits, Iterable): mask_bits = self._combine_bitfield(mask_bits) return XtdfMaskedKeyData( self, key, mask_key=self._mask_data_key, mask_bits=mask_bits, masked_value=masked_value ) # Several methods below are overridden in LPD1M for parallel gain mode @staticmethod def _select_pulse_indices(pulses, counts): """Select pulses by index across a chunk of trains Returns a boolean array of frames to include. """ sel_frames = np.zeros(counts.sum(), dtype=np.bool_) cursor = 0 for count in counts: sel_in_train = pulses.value if isinstance(sel_in_train, np.ndarray): # Ignore any indices after the end of the train sel_in_train = sel_in_train[sel_in_train < count] sel_frames[cursor:cursor + count][sel_in_train] = 1 cursor += count return sel_frames def _make_image_index(self, tids, inner_ids, inner_name='pulse'): """ Prepare indices for data per inner coordinate. Parameters ---------- tids: np.array Train id repeated for each inner coordinate. inner_ids: np.array Array of inner coordinate values. inner_name: string Name of the inner coordinate. Returns ------- pd.MultiIndex MultiIndex of 'train_ids' x 'inner_ids'. """ # Overridden in LPD1M for parallel gain mode return pd.MultiIndex.from_arrays( [tids, inner_ids], names=['train', inner_name] ) def _read_inner_ids(self, field='pulseId'): """Read pulse/cell IDs into a 2D array (frames, modules) Overridden by LPD1M for parallel gain mode. """ inner_ids = np.full(( self.frame_counts.sum(), self.n_modules), NO_PULSE_ID, dtype=np.uint64 ) for source, modno in self.source_to_modno.items(): for chunk in self.data._find_data_chunks(source, 'image.' + field): dset = chunk.dataset unwanted_dim = (dset.ndim > 1) and (dset.shape[1] == 1) for tgt_slice, chunk_slice in self._split_align_chunk( chunk, self.train_ids_perframe ): # Select the matching data and add it to pulse_ids # In some cases, there's an extra dimension of length 1. matched = chunk.dataset[chunk_slice] if unwanted_dim: matched = matched[:, 0] inner_ids[tgt_slice, modno] = matched return inner_ids def _collect_inner_ids(self, field='pulseId'): """ Gather pulse/cell ID labels for all modules and check consistency. Raises ------ Exception: Some data has no pulse ID values for any module. Exception: Inconsistent pulse IDs between detector modules. Returns ------- inner_ids: np.array Array of pulse/cell IDs per frame common for all detector modules. """ inner_ids = self._read_inner_ids(field) # Sanity checks on pulse IDs inner_ids_min: np.ndarray = inner_ids.min(axis=1) if (inner_ids_min == NO_PULSE_ID).any(): raise Exception(f"Failed to find {field} for some data") inner_ids[inner_ids == NO_PULSE_ID] = 0 if (inner_ids_min != inner_ids.max(axis=1)).any(): raise Exception(f"Inconsistent {field} for different modules") # Pulse IDs make sense. Drop the modules dimension, giving one # pulse ID for each frame. return inner_ids_min def get_array(self, key, pulses=np.s_[:], unstack_pulses=True, *, fill_value=None, subtrain_index='pulseId', roi=(), astype=None): """Get a labelled array of detector data Parameters ---------- key: str The data to get, e.g. 'image.data' for pixel values. pulses: slice, array, by_id or by_index Select the pulses to include from each train. by_id selects by pulse ID, by_index by index within the data being read. The default includes all pulses. Only used for per-pulse data. unstack_pulses: bool Whether to separate train and pulse dimensions. fill_value: int or float, optional Value to use for missing values. If None (default) the fill value is 0 for integers and np.nan for floats. subtrain_index: str Specify 'pulseId' (default) or 'cellId' to label the frames recorded within each train. Pulse ID should allow this data to be matched with other devices, but depends on how the detector was manually configured when the data was taken. Cell ID refers to the memory cell used for that frame in the detector hardware. roi: tuple Specify e.g. ``np.s_[10:60, 100:200]`` to select pixels within each module when reading data. The selection is applied to each individual module, so it may only be useful when working with a single module. For AGIPD raw data, each module records a frame as a 3D array with 2 entries on the first dimension, for data & gain information, so ``roi=np.s_[0]`` will select only the data part of each frame. astype: Type data type of the output array. If None (default) the dtype matches the input array dtype """ if subtrain_index not in {'pulseId', 'cellId'}: raise ValueError("subtrain_index must be 'pulseId' or 'cellId'") if not isinstance(roi, tuple): roi = (roi,) if key.startswith('image.'): return self[key].select_pulses(pulses).xarray( fill_value=fill_value, roi=roi, subtrain_index=subtrain_index, astype=astype, unstack_pulses=unstack_pulses, ) else: return super().get_array( key, fill_value=fill_value, roi=roi, astype=astype ) def get_dask_array(self, key, subtrain_index='pulseId', fill_value=None, astype=None): """Get a labelled Dask array of detector data Dask does lazy, parallelised computing, and can work with large data volumes. This method doesn't immediately load the data: that only happens once you trigger a computation. Parameters ---------- key: str The data to get, e.g. 'image.data' for pixel values. subtrain_index: str, optional Specify 'pulseId' (default) or 'cellId' to label the frames recorded within each train. Pulse ID should allow this data to be matched with other devices, but depends on how the detector was manually configured when the data was taken. Cell ID refers to the memory cell used for that frame in the detector hardware. fill_value: int or float, optional Value to use for missing values. If None (default) the fill value is 0 for integers and np.nan for floats. astype: Type, optional data type of the output array. If None (default) the dtype matches the input array dtype """ from xarray import DataArray if subtrain_index not in {'pulseId', 'cellId'}: raise ValueError("subtrain_index must be 'pulseId' or 'cellId'") if key.startswith('image.'): arr = self[key].dask_array( labelled=True, subtrain_index=subtrain_index, fill_value=fill_value, astype=astype ) # Preserve the quirks of this method before refactoring if self[key]._extraneous_dim: arr = arr.expand_dims('tmp_name', axis=2) frame_idx = arr.indexes['train_pulse'].set_names( ['trainId', subtrain_index], level=[0, -1] ) dims = ['module', 'train_pulse'] + [f'dim_{i}' for i in range(arr.ndim - 2)] return DataArray(arr.data, dims=dims, coords={ 'train_pulse': frame_idx, 'module': arr.indexes['module'], }) else: return super().get_dask_array(key, fill_value=fill_value, astype=astype) def trains(self, pulses=np.s_[:], require_all=True): """Iterate over trains for detector data. Parameters ---------- pulses: slice, array, by_index or by_id Select which pulses to include for each train. The default is to include all pulses. require_all: bool If True (default), skip trains where any of the selected detector modules are missing data. Yields ------ train_data: dict A dictionary mapping key names (e.g. ``image.data``) to labelled arrays. """ return MPxDetectorTrainIterator(self, pulses, require_all=require_all) def write_virtual_cxi(self, filename, fillvalues=None): """Write a virtual CXI file to access the detector data. The virtual datasets in the file provide a view of the detector data as if it was a single huge array, but without copying the data. Creating and using virtual datasets requires HDF5 1.10. Parameters ---------- filename: str The file to be written. Will be overwritten if it already exists. fillvalues: dict, optional keys are datasets names (one of: data, gain, mask) and associated fill value for missing data (default is np.nan for float arrays and zero for integer arrays) """ XtdfCXIWriter(self).write(filename, fillvalues=fillvalues) def write_frames(self, filename, trains, pulses): """Write selected detector frames to a new EuXFEL HDF5 file trains and pulses should be 1D arrays of the same length, containing train IDs and pulse IDs (corresponding to the pulse IDs recorded by the detector). i.e. (trains[i], pulses[i]) identifies one frame. """ if (trains.ndim != 1) or (pulses.ndim != 1): raise ValueError("trains & pulses must be 1D arrays") inc_tp_ids = zip_trains_pulses(trains, pulses) writer = FramesFileWriter(filename, self.data, inc_tp_ids) try: writer.write() finally: writer.file.close() def zip_trains_pulses(trains, pulses): """Combine two similar arrays of train & pulse IDs as one struct array """ if trains.shape != pulses.shape: raise ValueError( f"Train & pulse arrays don't match ({trains.shape} != {pulses.shape})" ) res = np.zeros(trains.shape, dtype=np.dtype([ ('trainId', np.uint64), ('pulseId', np.uint64) ])) res['trainId'] = trains res['pulseId'] = pulses return res class MultimodKeyData: def __init__(self, det: MultimodDetectorBase, key): self.det = det self.key = key self.modno_to_keydata = { m: det.data[s, key] for (m, s) in det.modno_to_source.items() } def _init_kwargs(self): # Extended in subclasses return dict(det=self.det, key=self.key) @property def train_ids(self): return self.det.train_ids def train_id_coordinates(self): return np.array(self.det.train_ids) @property def modules(self): return sorted(self.modno_to_keydata) @property def _eg_keydata(self): return self.modno_to_keydata[min(self.modno_to_keydata)] @property def ndim(self): return self._eg_keydata.ndim + 1 def buffer_shape(self, module_gaps=False, roi=()): """Get the array shape for this data If *module_gaps* is True, include space for modules which are missing from the data. *roi* may be a tuple of slices defining a region of interest on the inner dimensions of the data. """ module_dim = self.det.n_modules if module_gaps else len(self.modno_to_keydata) return ((module_dim, len(self.train_ids)) # Shape of 1 frame for 1 module with the ROI applied: + roi_shape(self._eg_keydata.entry_shape, roi)) @property def shape(self): return self.buffer_shape() @property def dimensions(self): return ['module', 'trainId'] + ['dim_%d' % i for i in range(self.ndim - 2)] @property def dtype(self): return self._eg_keydata.dtype # For select_trains() & split_trains() to work correctly with subclasses def _with_selected_det(self, det_selected): kw = self._init_kwargs() kw.update(det=det_selected) return type(self)(**kw) def select_trains(self, trains): return self._with_selected_det(self.det.select_trains(trains)) def __getitem__(self, item): return self.select_trains(item) __iter__ = None # Disable iteration def split_trains(self, parts=None, trains_per_part=None, frames_per_part=None): for det_split in self.det.split_trains(parts, trains_per_part, frames_per_part): yield self._with_selected_det(det_split) def ndarray(self, *, fill_value=None, out=None, roi=(), astype=None, module_gaps=False): """Get data as a plain NumPy array with no labels""" train_ids = np.asarray(self.det.train_ids) out_shape = self.buffer_shape(module_gaps, roi) if out is None: dtype = self._eg_keydata.dtype if astype is None else np.dtype(astype) out = _out_array(out_shape, dtype, fill_value=fill_value) elif out.shape != out_shape: raise ValueError(f'requires output array of shape {out_shape}') for i, (modno, kd) in enumerate(sorted(self.modno_to_keydata.items())): mod_ix = (modno - self.det._modnos_start_at) if module_gaps else i for chunk in kd._data_chunks: for tgt_slice, chunk_slice in self.det._split_align_chunk(chunk, train_ids): chunk.dataset.read_direct( out[mod_ix, tgt_slice], source_sel=(chunk_slice,) + roi ) return out def _wrap_xarray(self, arr): from xarray import DataArray coords = {'module': self.modules, 'trainId': self.train_id_coordinates()} return DataArray(arr, dims=self.dimensions, coords=coords) def xarray(self, *, fill_value=None, roi=(), astype=None): arr = self.ndarray(fill_value=fill_value, roi=roi, astype=astype) return self._wrap_xarray(arr) def dask_array(self, *, labelled=False, fill_value=None, astype=None): from dask.delayed import delayed from dask.array import concatenate, from_delayed entry_size = (self.dtype.itemsize * len(self.modno_to_keydata) * np.prod(self._eg_keydata.entry_shape) ) # Aim for 1GB chunks, with an arbitrary maximum of 256 trains split = self.split_trains(frames_per_part=min(1024 ** 3 / entry_size, 256)) arr = concatenate([from_delayed( delayed(c.ndarray)(fill_value=fill_value, astype=astype), shape=c.shape, dtype=self.dtype ) for c in split], axis=1) if labelled: return self._wrap_xarray(arr) return arr def data_availability(self, module_gaps=False): """Get an array indicating what data is available Returns a boolean array (modules, entries), True where a module has data for a given train, False for missing data. """ train_ids = self.train_id_coordinates() module_dim = self.det.n_modules if module_gaps else len(self.modno_to_keydata) out = np.zeros((module_dim, len(train_ids)), dtype=np.bool_) for i, (modno, kd) in enumerate(sorted(self.modno_to_keydata.items())): mod_ix = (modno - self.det._modnos_start_at) if module_gaps else i for chunk in kd._data_chunks: for tgt_slice, _ in self.det._split_align_chunk(chunk, train_ids): out[mod_ix, tgt_slice] = True return out class DetectorMaskedKeyData(MultimodKeyData): def __init__(self, *args, mask_key, mask_bits, masked_value, **kwargs): super().__init__(*args, **kwargs) self._mask_key = mask_key self._mask_bits = mask_bits self._masked_value = masked_value def __repr__(self): return f"" def _init_kwargs(self): kw = super()._init_kwargs() kw.update( mask_key=self._mask_key, mask_bits=self._mask_bits, masked_value=self._masked_value, ) return kw # Overridden for XTDF data to accommodate pulse selection def _mask_keydata(self): return self.det[self._mask_key] def _load_mask(self, module_gaps): """Load the mask & convert to boolean (True for bad pixels)""" mask_data = self._mask_keydata().ndarray(module_gaps=module_gaps) if self._mask_bits is None: return mask_data != 0 # Skip extra temporary array from & else: return (mask_data & self._mask_bits) != 0 def ndarray(self, *, module_gaps=False, **kwargs): """Load data into a NumPy array & apply the mask""" # Load mask first: it shrinks from 4 bytes/px to 1, so peak memory use # is lower than loading it after the data mask = self._load_mask(module_gaps=module_gaps) data = super().ndarray(module_gaps=module_gaps, **kwargs) data[mask] = self._masked_value return data class XtdfImageMultimodKeyData(MultimodKeyData): _sel_frames_cached = None det: XtdfDetectorBase def __init__(self, det: XtdfDetectorBase, key, pulse_sel=by_index[0:MAX_PULSES:1]): super().__init__(det, key) self._pulse_sel = pulse_sel entry_shape = self._eg_keydata.entry_shape self._extraneous_dim = (len(entry_shape) >= 1) and (entry_shape[0] == 1) def _init_kwargs(self): kw = super()._init_kwargs() kw.update(pulse_sel=self._pulse_sel) return kw @property def ndim(self): return super().ndim - (1 if self._extraneous_dim else 0) def _all_pulses(self): psv = self._pulse_sel.value return isinstance(psv, slice) and psv == slice(0, MAX_PULSES, 1) def buffer_shape(self, module_gaps=False, roi=()): """Get the array shape for this data If *module_gaps* is True, include space for modules which are missing from the data. *roi* may be a tuple of slices defining a region of interest on the inner dimensions of the data. """ module_dim = self.det.n_modules if module_gaps else len(self.modno_to_keydata) # len(self.train_id_coordinates()), but avoids allocating extra arrays if self._all_pulses(): nframes_sel = len(self.det.train_ids_perframe) else: nframes_sel = int(self._sel_frames.sum()) entry_shape = self._eg_keydata.entry_shape if self._extraneous_dim: entry_shape = entry_shape[1:] return (module_dim, nframes_sel) + roi_shape(entry_shape, roi) @property def shape(self): return self.buffer_shape() def train_id_coordinates(self): # XTDF 'image' group can have >1 entry per train a = self.det.train_ids_perframe # Only allocate sel_frames array if we need it: if not self._all_pulses(): a = a[self._sel_frames] else: a = a.copy() # So you can't accidentally modify the internal array return a def pulse_id_coordinates(self): """Get an array of pulse IDs per-frame for this data""" return self.det._collect_inner_ids('pulseId') def cell_id_coordinates(self): """Get an array of memory cell IDs per-frame for this data""" return self.det._collect_inner_ids('cellId') @property def dimensions(self): ndim_inner = self.ndim - 2 # TODO: this assumes we can tell what the axes are just from the # number of dimensions. Works for the data we've seen, but we # should look for a more reliable way. if ndim_inner == 3: # image.data in raw data entry_dims = ['data_gain', 'slow_scan', 'fast_scan'] elif ndim_inner == 2: # image.data, image.gain, image.mask in calibrated data entry_dims = ['slow_scan', 'fast_scan'] else: # Everything else seems to be 1D, but just in case entry_dims = [f'dim_{i}' for i in range(ndim_inner)] return ['module', 'train_pulse'] + entry_dims def select_pulses(self, pulses): kw = self._init_kwargs() kw.update(pulse_sel=_check_pulse_selection(pulses)) return type(self)(**kw) @property def _sel_frames(self): if self._sel_frames_cached is None: p = self._pulse_sel if isinstance(p, by_index): if self._all_pulses(): s = np.ones(len(self.det.train_ids_perframe), np.bool_) else: s = self.det._select_pulse_indices(p, self.det.frame_counts) elif isinstance(p, by_id): pulse_ids = self.det._collect_inner_ids('pulseId') s = _select_pulse_ids(p, pulse_ids) else: raise TypeError(f"Pulse selection should not be {type(p)}") self._sel_frames_cached = s return self._sel_frames_cached def _read_chunk(self, chunk: DataChunk, mod_out, roi): """Read per-pulse data from file into an output array (of 1 module)""" # Limit to 5 GB sections of the dataset at once, so the temporary # arrays used in the workaround below are not too large. nbytes_frame = chunk.dataset.dtype.itemsize for dim in chunk.dataset.shape[1:]: nbytes_frame *= dim frame_limit = 5 * (1024 ** 3) // nbytes_frame for tgt_slice, chunk_slice in self.det._split_align_chunk( chunk, self.det.train_ids_perframe, length_limit=frame_limit ): inc_pulses_chunk = self._sel_frames[tgt_slice] if inc_pulses_chunk.sum() == 0: # No data from this chunk selected continue elif inc_pulses_chunk.all(): # All pulses in chunk chunk.dataset.read_direct( mod_out[tgt_slice], source_sel=(chunk_slice,) + roi ) continue # Read a subset of pulses from the chunk: # Reading a non-contiguous selection in HDF5 seems to be slow: # https://forum.hdfgroup.org/t/performance-reading-data-with-non-contiguous-selection/8979 # Except it's fast if you read the data to a matching selection in # memory (one weird trick). # So as a workaround, this allocates a temporary array of the same # shape as the full chunk, reads into it, and then copies the selected # data to the output array. The extra memory copy is not optimal, # but it's better than the HDF5 performance issue, at least in some # realistic cases. # N.B. tmp should only use memory for the data it contains - # zeros() uses calloc, so the OS can do virtual memory tricks. # Don't change this to zeros_like() ! tmp = np.zeros( shape=inc_pulses_chunk.shape + chunk.dataset.shape[1:], dtype=chunk.dataset.dtype ) tmp_sel = np.nonzero(inc_pulses_chunk)[0] dataset_sel = tmp_sel + chunk_slice.start chunk.dataset.read_direct( tmp, source_sel=(dataset_sel,) + roi, dest_sel=(tmp_sel,) + roi, ) # Where does this data go in the target array? tgt_start_ix = self._sel_frames[:tgt_slice.start].sum() tgt_pulse_sel = slice( tgt_start_ix, tgt_start_ix + inc_pulses_chunk.sum() ) # Copy data from temp array to output array np.compress( inc_pulses_chunk, tmp[np.index_exp[:] + roi], axis=0, out=mod_out[tgt_pulse_sel] ) def ndarray(self, *, fill_value=None, out=None, roi=(), astype=None, module_gaps=False): """Get an array of per-pulse data (image.*) for xtdf detector""" out_shape = self.buffer_shape(module_gaps=module_gaps, roi=roi) if out is None: dtype = self._eg_keydata.dtype if astype is None else np.dtype(astype) out = _out_array(out_shape, dtype, fill_value=fill_value) elif out.shape != out_shape: raise ValueError(f'requires output array of shape {out_shape}') reading_view = out.view() if self._extraneous_dim: reading_view.shape = out.shape[:2] + (1,) + out.shape[2:] # Ensure ROI applies to pixel dimensions, not the extra # dim in raw data (except AGIPD, where it is data/gain) roi = np.index_exp[:] + roi for i, (modno, kd) in enumerate(sorted(self.modno_to_keydata.items())): mod_ix = (modno - self.det._modnos_start_at) if module_gaps else i for chunk in kd._data_chunks: self._read_chunk(chunk, reading_view[mod_ix], roi) return out def _wrap_xarray(self, arr, subtrain_index='pulseId'): from xarray import DataArray inner_ids = self.det._collect_inner_ids(subtrain_index) index = self.det._make_image_index( self.det.train_ids_perframe, inner_ids, subtrain_index[:-2] )[self._sel_frames] return DataArray(arr, dims=self.dimensions, coords={ 'train_pulse': index, 'module': self.modules, }) def xarray(self, *, pulses=None, fill_value=None, roi=(), astype=None, subtrain_index='pulseId', unstack_pulses=False): arr = self.ndarray(fill_value=fill_value, roi=roi, astype=astype) out = self._wrap_xarray(arr, subtrain_index) if unstack_pulses: # Separate train & pulse dimensions, and arrange dimensions # so that the data is contiguous in memory. dim_order = ['module'] + out.indexes['train_pulse'].names + self.dimensions[2:] return out.unstack('train_pulse').transpose(*dim_order) return out def dask_array(self, *, labelled=False, subtrain_index='pulseId', fill_value=None, astype=None, frames_per_chunk=None): from dask.delayed import delayed from dask.array import concatenate, from_delayed entry_size = (self.dtype.itemsize * len(self.modno_to_keydata) * np.prod(self._eg_keydata.entry_shape) ) if frames_per_chunk is None: # Aim for 2GB chunks, with an arbitrary maximum of 1024 frames frames_per_chunk = min(2 * 1024 ** 3 / entry_size, 1024) split = self.split_trains(frames_per_part=frames_per_chunk) arr = concatenate([from_delayed( delayed(c.ndarray)(fill_value=fill_value, astype=astype), shape=c.shape, dtype=self.dtype ) for c in split], axis=1) if labelled: return self._wrap_xarray(arr, subtrain_index) return arr class XtdfMaskedKeyData(DetectorMaskedKeyData, XtdfImageMultimodKeyData): # Created from xtdf_det.masked_data() def _mask_keydata(self): return self.det[self._mask_key].select_pulses(self._pulse_sel) class FramesFileWriter(FileWriter): """Write selected detector frames in European XFEL HDF5 format""" def __init__(self, path, data, inc_tp_ids): super().__init__(path, data) self.inc_tp_ids = inc_tp_ids def _guess_number_of_storing_entries(self, source, key): if source in self.data.instrument_sources and key.startswith("image."): # Start with an empty dataset, grow it as we add each file return 0 else: return super()._guess_number_of_storing_entries(source, key) def copy_image_data(self, source, keys): """Copy selected frames of the detector image data""" frame_tids_piecewise = [] src_files = sorted( self.data[source].files, key=lambda fa: fa.train_ids[0] ) for fa in src_files: _, counts = fa.get_index(source, 'image') file_tids = np.repeat(fa.train_ids, counts.astype(np.intp)) file_pids = fa.file[f'/INSTRUMENT/{source}/image/pulseId'][:] if file_pids.ndim == 2 and file_pids.shape[1] == 1: # Raw data has a spurious extra dimension file_pids = file_pids[:, 0] # Data can have trailing 0s, seemingly file_pids = file_pids[:len(file_tids)] file_tp_ids = zip_trains_pulses(file_tids, file_pids) # indexes of selected frames in datasets under .../image in this file ixs = np.isin(file_tp_ids, self.inc_tp_ids).nonzero()[0] nframes = ixs.shape[0] for key in keys: path = f"INSTRUMENT/{source}/{key.replace('.', '/')}" dst_ds = self.file[path] dst_cursor = dst_ds.shape[0] dst_ds.resize(dst_cursor + nframes, axis=0) dst_ds[dst_cursor: dst_cursor+nframes] = fa.file[path][ixs] frame_tids_piecewise.append(file_tids[ixs]) frame_tids = np.concatenate(frame_tids_piecewise) self._make_index(source, 'image', frame_tids) def copy_source(self, source): """Copy all the relevant data for one detector source""" if source not in self.data.instrument_sources: return super().copy_source(source) all_keys = self.data.keys_for_source(source) img_keys = {k for k in all_keys if k.startswith('image.')} for key in sorted(all_keys - img_keys): self.copy_dataset(source, key) self.copy_image_data(source, sorted(img_keys)) class MPxDetectorTrainIterator: """Iterate over trains in detector data, assembling arrays. Created by :meth:`DetectorData.trains`. """ def __init__(self, data, pulses=by_index[:], require_all=True): self.data = data self.pulses = _check_pulse_selection(pulses) self.require_all = require_all # {(source, key): (f, dataset)} self._datasets_cache = {} def _find_data(self, source, key, tid): """ Find FileAccess instance and dataset corresponding to source, key, and train id tid. Parameters ---------- source: string Path to keys in HD5 file, e.g.: 'SPB_DET_AGIPD1M-1/DET/5CH0:xtdf'. key: string Key for data at source separated by dot, e.g.: 'image.data'. tid: np.int Train id. Returns ------- Tuple[FileAccess, int, h5py.Dataset] FileAccess Instance for the HD5 file with requested data. int Starting index for the requested data. h5py.Dataset h5py dataset with found data. """ file, ds = self._datasets_cache.get((source, key), (None, None)) if ds: ixs = (file.train_ids == tid).nonzero()[0] if ixs.size > 0: return file, ixs[0], ds data = self.data.data path = '/INSTRUMENT/{}/{}'.format(source, key.replace('.', '/')) f, pos = data._find_data(source, tid) if f is not None: ds = f.file[path] self._datasets_cache[(source, key)] = (f, ds) return f, pos, ds return None, None, None def _get_slow_data(self, source, key, tid): """ Get an array of slow (per train) data corresponding to source, key, and train id tid. Also used for JUNGFRAU data with memory cell dimension. Parameters ---------- source: string Path to keys in HD5 file, e.g.: 'SPB_DET_AGIPD1M-1/DET/5CH0:xtdf'. key: string Key for data at source separated by dot, e.g.: 'header.pulseCount'. tid: np.int Train id. Returns ------- xarray.DataArray Array of selected slow data. In case there are more than one frame for the train id tid - train id dimension is kept indexing frames within tid. """ from xarray import DataArray file, pos, ds = self._find_data(source, key, tid) if file is None: return None group = key.partition('.')[0] firsts, counts = file.get_index(source, group) first, count = firsts[pos], counts[pos] if count == 1: return DataArray(ds[first]) else: return DataArray(ds[first : first + count]) def _get_pulse_data(self, source, key, tid): """ Get an array of per pulse data corresponding to source, key, and train id tid. Used only for AGIPD-like detectors, for JUNGFRAU-like per-cell data '_get_slow_data' is used. Parameters ---------- source: string Path to keys in HD5 file, e.g.: 'SPB_DET_AGIPD1M-1/DET/5CH0:xtdf'. key: string Key for data at source separated by dot, e.g.: 'image.data'. tid: np.int Train id. Returns ------- xarray.DataArray Array of selected per pulse data. """ from xarray import DataArray file, pos, ds = self._find_data(source, key, tid) if file is None: return None group = key.partition('.')[0] firsts, counts = file.get_index(source, group) first, count = firsts[pos], counts[pos] pulse_ids = file.file['/INSTRUMENT/{}/{}/pulseId'.format(source, group)][ first : first + count ] # Raw files have a spurious extra dimension if pulse_ids.ndim >= 2 and pulse_ids.shape[1] == 1: pulse_ids = pulse_ids[:, 0] if isinstance(self.pulses, by_id): positions = self._select_pulse_ids(pulse_ids) elif isinstance(self.pulses, by_index): positions = self._select_pulse_indices(count) else: raise TypeError(f"Pulse selection should not be {type(self.pulses)}") pulse_ids = pulse_ids[positions] train_ids = np.array([tid] * len(pulse_ids), dtype=np.uint64) train_pulse_ids = self.data._make_image_index(train_ids, pulse_ids) if isinstance(positions, slice): data_positions = slice( int(first + positions.start), int(first + positions.stop), positions.step ) else: # ndarray data_positions = first + positions data = ds[data_positions] # Raw files have a spurious extra dimension if data.ndim >= 2 and data.shape[1] == 1: data = data[:, 0] dims = self.data[key].dimensions[1:] # excluding 'module' dim coords = {'train_pulse': train_pulse_ids} arr = DataArray(data, coords=coords, dims=dims) # Separate train & pulse dimensions, and arrange dimensions # so that the data is contiguous in memory. dim_order = train_pulse_ids.names + dims[1:] return arr.unstack('train_pulse').transpose(*dim_order) def _select_pulse_ids(self, pulse_ids): """Select pulses by ID Returns an array or slice of the indexes to include. """ val = self.pulses.value N = len(pulse_ids) if isinstance(val, slice): if val.step == 1: after_start = np.nonzero(pulse_ids >= val.start)[0] after_stop = np.nonzero(pulse_ids >= val.stop)[0] start_ix = after_start[0] if (after_start.size > 0) else N stop_ix = after_stop[0] if (after_stop.size > 0) else N return slice(start_ix, stop_ix) # step != 1 desired = np.arange(val.start, val.stop, step=val.step, dtype=np.uint64) else: desired = val return np.nonzero(np.isin(pulse_ids, desired))[0].astype(np.uint64) def _select_pulse_indices(self, count): """Select pulses by index Returns an array or slice of the indexes to include. """ val = self.pulses.value if isinstance(val, slice): return slice(val.start, min(val.stop, count), val.step) # ndarray return val[val < count] def _assemble_data(self, tid): """ Assemble data for all keys into a dictionary for specified train id. Parameters ---------- tid: int Train id. Returns ------- Dict[str, xarray]: str Key name. xarray Assembled data array. """ import xarray key_module_arrays = {} for modno, source in sorted(self.data.modno_to_source.items()): for key in self.data.data._keys_for_source(source): # At present, all the per-pulse data is stored in the 'image' key. # If that changes, this check will need to change as well. if key.startswith('image.'): mod_data = self._get_pulse_data(source, key, tid) else: mod_data = self._get_slow_data(source, key, tid) if mod_data is None: continue if key not in key_module_arrays: key_module_arrays[key] = [], [] modnos, data = key_module_arrays[key] modnos.append(modno) data.append(mod_data) # Assemble the data for each key into one xarray return { k: xarray.concat(data, pd.Index(modnos, name='module')) for (k, (modnos, data)) in key_module_arrays.items() } def __iter__(self): """ Iterate over train ids and yield assembled data dictionaries. Yields ------ Tuple[int, Dict[str, xarray]]: int train id. Dict[str, xarray] assembled {key: data array} dictionary. """ for tid in self.data.train_ids: tid = int(tid) # Convert numpy int to regular Python int if self.require_all and self.data.data._check_data_missing(tid): continue yield tid, self._assemble_data(tid) @multimod_detectors class AGIPD1M(XtdfDetectorBase): """An interface to AGIPD-1M data. Parameters ---------- data: DataCollection A data collection, e.g. from :func:`.RunDirectory`. modules: set of ints, optional Detector module numbers to use. By default, all available modules are used. detector_name: str, optional Name of a detector, e.g. 'SPB_DET_AGIPD1M-1'. This is only needed if the dataset includes more than one AGIPD detector. min_modules: int Include trains where at least n modules have data. Default is 1. raw: bool True to access raw data, False for corrected. The default is to use corrected if available & raw otherwise. """ _det_name_pat = r'[^/]+_AGIPD1M[^/]*' _source_raw_pat = r'/DET/(?P\d+)CH' _source_corr_pat = r'/CORR/(?P\d+)CH' module_shape = (512, 128) @classmethod def _data_is_raw(cls, data, source: str): # Raw AGIPD data has an extra dimension (data/gain) compared to raw kd = data[source, cls._main_data_key] return kd.ndim == 4 @multimod_detectors class AGIPD500K(XtdfDetectorBase): """An interface to AGIPD-500K data Detector names are like 'HED_DET_AGIPD500K2G', otherwise this is identical to :class:`AGIPD1M`. """ _det_name_pat = r'[^/]+AGIPD500K[^/]*' _source_raw_pat = r'/DET/(?P\d+)CH' _source_corr_pat = r'/CORR/(?P\d+)CH' module_shape = (512, 128) n_modules = 8 @classmethod def _data_is_raw(cls, data, source: str): # Raw AGIPD data has an extra dimension (data/gain) compared to raw kd = data[source, cls._main_data_key] return kd.ndim == 4 @multimod_detectors class DSSC1M(XtdfDetectorBase): """An interface to DSSC-1M data. Parameters ---------- data: DataCollection A data collection, e.g. from :func:`.RunDirectory`. modules: set of ints, optional Detector module numbers to use. By default, all available modules are used. detector_name: str, optional Name of a detector, e.g. 'SCS_DET_DSSC1M-1'. This is only needed if the dataset includes more than one DSSC detector. min_modules: int Include trains where at least n modules have data. Default is 1. raw: bool True to access raw data, False for corrected. The default is to use corrected if available & raw otherwise. """ _det_name_pat = r'[^/]+_DSSC1M[^/]*' _source_raw_pat = r'/DET/(?P\d+)CH' _source_corr_pat = r'/CORR/(?P\d+)CH' module_shape = (128, 512) @multimod_detectors class LPD1M(XtdfDetectorBase): """An interface to LPD-1M data. Parameters ---------- data: DataCollection A data collection, e.g. from :func:`.RunDirectory`. modules: set of ints, optional Detector module numbers to use. By default, all available modules are used. detector_name: str, optional Name of a detector, e.g. 'FXE_DET_LPD1M-1'. This is only needed if the dataset includes more than one LPD detector. min_modules: int Include trains where at least n modules have data. Default is 1. raw: bool True to access raw data, False for corrected. The default is to use corrected if available & raw otherwise. parallel_gain: bool Set to True to read this data as parallel gain data, where high, medium and low gain data are stored sequentially within each train. This will repeat the pulse & cell IDs from the first 1/3 of each train, and add gain stage labels from 0 (high-gain) to 2 (low-gain). """ _det_name_pat = r'[^/]+_LPD1M[^/]*' _source_raw_pat = r'/DET/(?P\d+)CH' _source_corr_pat = r'/CORR/(?P\d+)CH' module_shape = (256, 256) def __init__(self, data: DataCollection, detector_name=None, modules=None, *, min_modules=1, parallel_gain=False, raw=None): super().__init__( data, detector_name, modules, min_modules=min_modules, raw=raw ) self.parallel_gain = parallel_gain if parallel_gain: if ((self.frame_counts % 3) != 0).any(): raise ValueError( "parallel_gain=True needs the frames in each train to be divisible by 3" ) def _read_inner_ids(self, field='pulseId'): inner_ids = super()._read_inner_ids(field) if not self.parallel_gain: return inner_ids # In 'parallel gain' mode, the first 1/3 of pulse/cell IDs in each train # are valid, but the remaining 2/3 are junk. So we'll repeat the valid # ones 3 times (in inner_ids_fixed). inner_ids_fixed = np.zeros_like(inner_ids) cursor = 0 for count in self.frame_counts: # Iterate through trains n_per_gain_stage = int(count // 3) train_inner_ids = inner_ids[cursor: cursor + n_per_gain_stage] for stage in range(3): end = cursor + n_per_gain_stage inner_ids_fixed[cursor:end] = train_inner_ids cursor = end return inner_ids_fixed def _select_pulse_indices(self, pulses, counts): """Select pulses by index across a chunk of trains Returns a boolean array of frames to include. """ if not self.parallel_gain: return super()._select_pulse_indices(pulses, counts) sel_frames = np.zeros(counts.sum(), dtype=np.bool_) cursor = 0 for count in counts: n_per_gain_stage = int(count // 3) sel_in_train = pulses.value if isinstance(sel_in_train, np.ndarray): # Ignore any indices after the end of the gain stage sel_in_train = sel_in_train[sel_in_train < n_per_gain_stage] for stage in range(3): sel_frames[cursor:cursor + n_per_gain_stage][sel_in_train] = 1 cursor += n_per_gain_stage return sel_frames def _make_image_index(self, tids, inner_ids, inner_name='pulse'): if not self.parallel_gain: return super()._make_image_index(tids, inner_ids, inner_name) # In 'parallel gain' mode, the first 1/3 of pulse/cell IDs in each train # are valid, but the remaining 2/3 are junk. So we'll repeat the valid # ones 3 times (in inner_ids_fixed). At the same time, we make a gain # stage index (0-2), so each frame has a unique entry in the MultiIndex # (train ID, gain, pulse/cell ID) gain = np.zeros_like(inner_ids, dtype=np.uint8) inner_ids_fixed = np.zeros_like(inner_ids) _, firsts, counts = np.unique(tids, return_index=True, return_counts=True) for ix, frames in zip(firsts, counts): # Iterate through trains n_per_gain_stage = int(frames // 3) train_inner_ids = inner_ids[ix: ix + n_per_gain_stage] for stage in range(3): start = ix + (stage * n_per_gain_stage) end = start + n_per_gain_stage gain[start:end] = stage inner_ids_fixed[start:end] = train_inner_ids return pd.MultiIndex.from_arrays( [tids, gain, inner_ids_fixed], names=['train', 'gain', inner_name] ) @multimod_detectors class JUNGFRAU(MultimodDetectorBase): """An interface to JUNGFRAU data. JNGFR, JF1M, JF4M all store data in a "data" group, with trains along the first and memory cells along the second dimension. This allows only a set number of frames to be stored for each train. Parameters ---------- data: DataCollection A data collection, e.g. from :func:`.RunDirectory`. detector_name: str, optional Name of a detector, e.g. 'SPB_IRDA_JNGFR'. This is only needed if the dataset includes more than one JUNGFRAU detector. modules: set of ints, optional Detector module numbers to use. By default, all available modules are used. min_modules: int Include trains where at least n modules have data. Default is 1. n_modules: int Number of detector modules in the experiment setup. Default is None, in which case it will be estimated from the available data. first_modno: int The module number in the source name for the first detector module. e.g. FXE_XAD_JF500K/DET/JNGFR03:daqOutput should have first_modno = 3 raw: bool True to access raw data, False for corrected. The default is to use corrected if available & raw otherwise. """ # We appear to have a few different formats for source names: # SPB_IRDA_JNGFR/DET/MODULE_1:daqOutput (e.g. in p 2566, r 61) # SPB_IRDA_JF4M/DET/JNGFR03:daqOutput (e.g. in p 2732, r 12) # FXE_XAD_JF500K/DET/JNGFR03:daqOutput (e.g. in p 2478, r 52) # HED_IA1_JF500K1/DET/JNGFR01:daqOutput (e.g. in p 2656, r 230) # FXE_XAD_JF1M/DET/RECEIVER-1 _det_name_pat = r'[^/]+_(JNGFR|JF[14]M|JUNGF|JF|JF500K\d?)' _source_raw_pat = r'/DET/(MODULE_|RECEIVER-|JNGFR)(?P\d+)' _source_corr_pat = r'/CORR/(MODULE_|RECEIVER-|JNGFR)(?P\d+)' _main_data_key = 'data.adc' _mask_data_key = 'data.mask' _modnos_start_at = 1 module_shape = (512, 1024) def __init__(self, data: DataCollection, detector_name=None, modules=None, *, min_modules=1, n_modules=None, first_modno=1, raw=None): super().__init__(data, detector_name, modules, min_modules=min_modules, raw=raw) # Overwrite modno based on given starting module number and update # source_to_modno and modno_to_source. # JUNGFRAU modno is expected (e.g. extra_geom) to start with 1. self.source_to_modno = {s: (m - first_modno + 1) for (s, m) in self.source_to_modno.items()} self.modno_to_source = {m: s for (s, m) in self.source_to_modno.items()} if n_modules is not None: self.n_modules = int(n_modules) else: # Re-scan sources without modules= selection to find largest number self.n_modules = max( self._identify_sources(data, self.detector_name, raw=raw).values() ) - first_modno + 1 # In burst mode, JUNGFRAU can have 16 frames per train src = next(iter(self.source_to_modno)) self._frames_per_entry = self.data[src, self._main_data_key].entry_shape[0] @staticmethod def _label_dims(arr): # Label dimensions to match the AGIPD/DSSC/LPD data access ndim_pertrain = arr.ndim if 'trainId' in arr.dims: arr = arr.rename({'trainId': 'train'}) ndim_pertrain = arr.ndim - 1 if ndim_pertrain == 4: arr = arr.rename({ 'dim_0': 'cell', 'dim_1': 'slow_scan', 'dim_2': 'fast_scan' }) elif ndim_pertrain == 2: arr = arr.rename({'dim_0': 'cell'}) return arr def get_array(self, key, *, fill_value=None, roi=(), astype=None): """Get a labelled array of detector data Parameters ---------- key: str The data to get, e.g. 'data.adc' for pixel values. fill_value: int or float, optional Value to use for missing values. If None (default) the fill value is 0 for integers and np.nan for floats. roi: tuple Specify e.g. ``np.s_[:, 10:60, 100:200]`` to select data within each module & each train when reading data. The first dimension is pulses, then there are two pixel dimensions. The same selection is applied to data from each module, so selecting pixels may only make sense if you're using a single module. astype: Type data type of the output array. If None (default) the dtype matches the input array dtype """ arr = super().get_array(key, fill_value=fill_value, roi=roi, astype=astype) return self._label_dims(arr) def get_dask_array(self, key, fill_value=None, astype=None): """Get a labelled Dask array of detector data Dask does lazy, parallelised computing, and can work with large data volumes. This method doesn't immediately load the data: that only happens once you trigger a computation. Parameters ---------- key: str The data to get, e.g. 'data.adc' for pixel values. fill_value: int or float, optional Value to use for missing values. If None (default) the fill value is 0 for integers and np.nan for floats. astype: Type data type of the output array. If None (default) the dtype matches the input array dtype """ arr = super().get_dask_array(key, fill_value=fill_value, astype=astype) return self._label_dims(arr) def trains(self, require_all=True): """Iterate over trains for detector data. Parameters ---------- require_all: bool If True (default), skip trains where any of the selected detector modules are missing data. Yields ------ train_data: dict A dictionary mapping key names (e.g. 'data.adc') to labelled arrays. """ for tid, d in super().trains(require_all=require_all): yield tid, {k: self._label_dims(a) for (k, a) in d.items()} def write_virtual_cxi(self, filename, fillvalues=None): """Write a virtual CXI file to access the detector data. The virtual datasets in the file provide a view of the detector data as if it was a single huge array, but without copying the data. Creating and using virtual datasets requires HDF5 1.10. Parameters ---------- filename: str The file to be written. Will be overwritten if it already exists. fillvalues: dict, optional keys are datasets names (one of: data, gain, mask) and associated fill value for missing data (default is np.nan for float arrays and zero for integer arrays) """ JUNGFRAUCXIWriter(self).write(filename, fillvalues=fillvalues) def cell_ids(self): MISSING = 255 # To fit in uint8 cids = self.select_trains(np.s_[:1])['data.memoryCell'].ndarray( fill_value=MISSING )[:, 0] # -> (modules, cells) cells_min = cids.min(axis=0) if (cells_min == MISSING).any(): raise Exception(f"Failed to find memoryCell") cids[cids == MISSING] = 0 if (cells_min != cids.max(axis=0)).any(): raise Exception(f"Inconsistent memoryCell for different modules") # Pulse IDs make sense. Drop the modules dimension, giving one # pulse ID for each frame. return cells_min def identify_multimod_detectors( data, detector_name=None, *, single=False, clses=None ): """Identify multi-module detectors in the data Various detectors record data for individual X-ray pulses within trains, and we often want to process whichever detector was used in a run. This tries to identify the detector, so a user doesn't have to specify it manually. If ``single=True``, this returns a tuple of (detector_name, access_class), throwing ``ValueError`` if there isn't exactly 1 detector found. If ``single=False``, it returns a set of these tuples. *clses* may be a list of acceptable detector classes to check. """ if clses is None: clses = multimod_detectors.list res = set() for cls in clses: for name in cls._find_detector_names(data): res.add((name, cls)) if single: if len(res) < 1: raise ValueError("No detector sources identified in the data") elif len(res) > 1: raise ValueError("Multiple detectors identified: {}".format( ", ".join(name for (name, _) in res) )) return res.pop() return res ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/copy.py0000644000175100001660000001325514757376472017005 0ustar00runnerdockerimport sys from argparse import ArgumentParser from pathlib import Path from typing import Union import h5py from .utils import progress_bar __all__ = ["copy_structure"] def progress(processed, total, *, show=True): """Show progress information""" if not show: return pbar = progress_bar(processed, total) if sys.stderr.isatty(): # "\x1b[2K": delete whole line, "\x1b[1A": move up cursor print("\x1b[2K\x1b[1A\x1b[2K\x1b[1A", file=sys.stderr) print(pbar, file=sys.stderr) class Cloner: def __init__(self, input, output, *, run_data=False, control_data=False): self.run_data = run_data self.control_data = control_data self.visited = {} if output.file.mode == "r": raise ValueError("Output file must be writeable.") self.visit(input, output) @staticmethod def _copy_attrs(input, output): for key, value in input.attrs.items(): output.attrs.create(key, value) def visit(self, obj, output): if obj.name != "/": link = obj.file.get(obj.name, getlink=True) if isinstance(link, h5py.SoftLink): # note this works only for SoftLinks. ExternalLink object's # name is not the name of the path, but the targeted file's path output[obj.name] = link return obj_id = h5py.h5o.get_info(obj.id).addr if obj_id in self.visited: # Hardlink to an object we've already seen output[obj.name] = output[self.visited[obj_id]] return self.visited[obj_id] = obj.name if isinstance(obj, h5py.Dataset): if ( obj.name.startswith("/INSTRUMENT") or (obj.name.startswith("/CONTROL") and not self.control_data) or (obj.name.startswith("/RUN") and not self.run_data) ): output_obj = output.create_dataset_like(obj.name, obj) else: # note: consider using h5py.File.copy once a bug causing # segfault for dataset with attributes is fixed, # see: https://github.com/HDFGroup/hdf5/issues/2414 output_obj = output.create_dataset_like(obj.name, obj, data=obj[()]) self._copy_attrs(obj, output_obj) elif isinstance(obj, h5py.Group): if obj == obj.file: # root object output_obj = output["/"] else: output_obj = output.create_group(obj.name) self._copy_attrs(obj, output_obj) for name, child in obj.items(): if child.file != obj.file: # external link output[f'{obj.name}/{name}'] = obj.get(name, getlink=True) else: self.visit(child, output) else: # unknown type return def copy_structure( input: Union[Path, str], output: Union[Path, str], *, run_data=False, control_data=False, term_progress=False, ) -> None: """Clone EuXFEL HDF5 file structure without any of its data. Clone the input file or files present the input directory. The cloned files will be written to output. args: run_data: Copy data in RUN group if set to True control_data: Copy data in CONTROL group if set to True term_progress: show progress in terminal if set to True """ if isinstance(input, str): input = Path(input) input = input.expanduser() if isinstance(output, str): output = Path(output) output = output.expanduser() if not output.is_dir(): raise ValueError(f"The given output directory does not exist: {output}") if h5py.is_hdf5(input): if output == input.parent: raise ValueError("Input and output must be different directories.") Cloner( h5py.File(input), h5py.File(output / input.name, "w"), run_data=run_data, control_data=control_data, ) elif input.is_dir(): if output == input: raise ValueError("Input and output must be different directories.") # clone all hdf5 file present in the given directory h5files = [f for f in input.glob("*") if h5py.is_hdf5(f)] progress(0, len(h5files), show=term_progress) for n, file_ in enumerate(h5files, start=1): Cloner( h5py.File(file_), h5py.File(output / file_.name, "w"), run_data=run_data, control_data=control_data, ) progress(n, len(h5files), show=term_progress) else: raise ValueError(f"invalid input: {input}") def main(argv=None): ap = ArgumentParser("Clone EuXFEL HDF5 files but with empty datasets.") ap.add_argument("input", type=str, help="Path to an HDF5 file or a directory.") ap.add_argument( "output", type=str, help="Output directory to write the cloned files." ) ap.add_argument( "--copy-run-data", "-cr", action="store_true", default=False, help="Copy data present in the RUN group.", ) ap.add_argument( "--copy-control-data", "-cc", action="store_true", default=False, help="Copy data present in the CONTROL group.", ) args = ap.parse_args(argv) print(f"Cloning file(s) structure:\ninput: {args.input}\nOutput: {args.output}\n") copy_structure( args.input, args.output, run_data=args.copy_run_data, control_data=args.copy_control_data, term_progress=True, ) print("Done.") if __name__ == "__main__": main(sys.argv[1:]) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/exceptions.py0000644000175100001660000000331514757376472020210 0ustar00runnerdocker"""Exception classes specific to extra_data.""" class FileStructureError(Exception): pass class SourceNameError(KeyError): def __init__(self, source): self.source = source def __str__(self): return ( "This data has no source named {!r}.\n" "See data.all_sources for available sources.".format(self.source) ) class PropertyNameError(KeyError): def __init__(self, prop, source): self.prop = prop self.source = source def __str__(self): return "No property {!r} for source {!r}".format(self.prop, self.source) class TrainIDError(KeyError): def __init__(self, train_id): self.train_id = train_id def __str__(self): return "Train ID {!r} not found in this data".format(self.train_id) class AliasError(KeyError): def __init__(self, alias): self.alias = alias def __str__(self): return f"'{self.alias}' not known as alias for this data" class MultiRunError(ValueError): def __str__(self): return ( "The requested data is only available for a single run. This " "EXtra-data DataCollection may have data from multiple runs, e.g. " "because you have used .union() to combine data. Please retrieve " "this information before combining." ) class NoDataError(ValueError): def __init__(self, source, key=None): self.source = source self.key = key def __str__(self): if self.key is not None: return 'This data is empty for key {!r} of source {!r}'.format( self.key, self.source) else: return 'This data is empty for source {!r}'.format(self.source) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/export.py0000644000175100001660000001640114757376472017350 0ustar00runnerdocker# coding: utf-8 """Expose data to different interface ZMQStream explose to a ZeroMQ socket in a REQ/REP pattern. Copyright (c) 2017, European X-Ray Free-Electron Laser Facility GmbH All rights reserved. You should have received a copy of the 3-Clause BSD License along with this program. If not, see """ import os.path as osp import time from collections import deque from socket import AF_INET from warnings import warn from karabo_bridge import ServerInThread from karabo_bridge.server import Sender from psutil import net_if_addrs from .components import XtdfDetectorBase from .exceptions import SourceNameError from .reader import RunDirectory, H5File from .stacking import stack_detector_data __all__ = ['ZMQStreamer', 'serve_files'] def find_infiniband_ip(): """Find the first infiniband IP address :returns: str IP of the first infiniband interface if it exists else '*' """ addrs = net_if_addrs() for addr in addrs.get('ib0', ()): if addr.family == AF_INET: return addr.address return '*' class ZMQStreamer(ServerInThread): def __init__(self, port, sock='REP', maxlen=10, protocol_version='2.2', dummy_timestamps=False): warn("Please use :ref:karabo_bridge.ServerInThread instead", DeprecationWarning, stacklevel=2) endpoint = f'tcp://*:{port}' super().__init__(endpoint, sock=sock, maxlen=maxlen, protocol_version=protocol_version, dummy_timestamps=dummy_timestamps) def _iter_trains(data, merge_detector=False): """Iterate over trains in data and merge detector tiles in a single source :data: DataCollection :merge_detector: bool if True and data contains detector data (e.g. AGIPD) individual sources for each detector tiles are merged in a single source. The new source name keep the original prefix, but replace the last 2 part with '/DET/APPEND'. Individual sources are removed from the train data :yield: dict train data """ det, source_name = None, '' if merge_detector: for detector in XtdfDetectorBase.__subclasses__(): try: det = detector(data) source_name = f'{det.detector_name}/DET/APPEND' except SourceNameError: continue else: break for tid, train_data in data.trains(): if not train_data: continue if det is not None: det_data = { k: v for k, v in train_data.items() if k in det.data.detector_sources } # get one of the module to reference other datasets train_data[source_name] = mod_data = next(iter(det_data.values())) stacked = stack_detector_data(det_data, 'image.data') mod_data['image.data'] = stacked mod_data['metadata']['source'] = source_name if 'image.gain' in mod_data: stacked = stack_detector_data(det_data, 'image.gain') mod_data['image.gain'] = stacked if 'image.mask' in mod_data: stacked = stack_detector_data(det_data, 'image.mask') mod_data['image.mask'] = stacked # remove individual module sources for src in det.data.detector_sources: del train_data[src] yield tid, train_data def serve_files(path, port, source_glob='*', key_glob='*', **kwargs): """Stream data from files through a TCP socket. Parameters ---------- path: str Path to the HDF5 file or file folder. port: str or int A ZMQ endpoint (e.g. 'tcp://*:44444') or a TCP port to bind the socket to. Integers or strings of all digits are treated as port numbers. source_glob: str Only stream sources matching this glob pattern. Streaming data selectively is more efficient than streaming everything. key_glob: str Only stream keys matching this glob pattern in the selected sources. append_detector_modules: bool Combine multi-module detector data in a single data source (sources for individual modules are removed). The last section of the source name is replaces with 'APPEND', example: 'SPB_DET_AGIPD1M-1/DET/#CH0:xtdf' -> 'SPB_DET_AGIPD1M-1/DET/APPEND' Supported detectors: AGIPD, DSSC, LPD dummy_timestamps: bool Whether to add mock timestamps if the metadata lacks them. use_infiniband: bool Use infiniband interface if available (if port specifies a TCP port) sock: str socket type - supported: REP, PUB, PUSH (default REP). """ if osp.isdir(path): data = RunDirectory(path) else: data = H5File(path) data = data.select(source_glob, key_glob) serve_data(data, port, **kwargs) def serve_data(data, port, append_detector_modules=False, dummy_timestamps=False, use_infiniband=False, sock='REP'): """Stream data from files through a TCP socket. Parameters ---------- data: DataCollection The data to be streamed; should already have sources & keys selected. port: str or int A ZMQ endpoint (e.g. 'tcp://*:44444') or a TCP port to bind the socket to. Integers or strings of all digits are treated as port numbers. append_detector_modules: bool Combine multi-module detector data in a single data source (sources for individual modules are removed). The last section of the source name is replaces with 'APPEND', example: 'SPB_DET_AGIPD1M-1/DET/#CH0:xtdf' -> 'SPB_DET_AGIPD1M-1/DET/APPEND' Supported detectors: AGIPD, DSSC, LPD dummy_timestamps: bool Whether to add mock timestamps if the metadata lacks them. use_infiniband: bool Use infiniband interface if available (if port specifies a TCP port) sock: str socket type - supported: REP, PUB, PUSH (default REP). """ if isinstance(port, int) or port.isdigit(): endpt = f'tcp://{find_infiniband_ip() if use_infiniband else "*"}:{port}' else: endpt = port sender = Sender(endpt, sock=sock, dummy_timestamps=dummy_timestamps) print(f'Streamer started on: {sender.endpoint}') ntrains = len(data.train_ids) sent_times = deque([time.monotonic()], 10) count = 0 tid, rate = 0, 0. def print_update(end='\r'): print(f'Sent {count}/{ntrains} trains - Train ID {tid} - {rate:.1f} Hz', end=end) for tid, data in _iter_trains(data, merge_detector=append_detector_modules): sender.send(data) count += 1 new_time = time.monotonic() if count % 5 == 0: rate = len(sent_times) / (new_time - sent_times[0]) print_update() sent_times.append(new_time) print_update(end='\n') # The karabo-bridge code sets linger to 0 so that it doesn't get stuck if # the client goes away. But this would also mean that we close the socket # when the last messages have been queued but not sent. So if we've # successfully queued all the messages, set linger -1 (i.e. infinite) to # wait until ZMQ has finished transferring them before the socket is closed. sender.server_socket.close(linger=-1) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/file_access.py0000644000175100001660000005275714757376472020305 0ustar00runnerdocker"""Internal module for accessing EuXFEL HDF5 files This includes convenience features for getting the metadata & indexes from a file, as well as machinery to close less recently accessed files, so we don't run into the limit on the number of open files. """ from collections import defaultdict, OrderedDict import h5py, h5py.h5o import numpy as np import os import os.path as osp import re import resource from warnings import warn from weakref import WeakValueDictionary from .exceptions import FileStructureError, SourceNameError # Track all FileAccess objects - {path: FileAccess} file_access_registry = WeakValueDictionary() class OpenFilesLimiter(object): """ Working with FileAccess, keep the number of opened HDF5 files under the given limit by closing files accessed longest time ago. """ def __init__(self, maxfiles=128): self._maxfiles = maxfiles # We don't use the values, but OrderedDict is a handy as a queue # with efficient removal of entries by key. self._cache = OrderedDict() @property def maxfiles(self): return self._maxfiles @maxfiles.setter def maxfiles(self, maxfiles): """Set the new file limit and closes files over the limit""" self._maxfiles = maxfiles self.close_old_files() def _check_files(self): # Discard entries from self._cache if their FileAccess no longer exists self._cache = OrderedDict.fromkeys( path for path in self._cache if path in file_access_registry ) def n_open_files(self): self._check_files() return len(self._cache) def close_old_files(self): if len(self._cache) <= self.maxfiles: return # Now check how many paths still have an existing FileAccess object n = self.n_open_files() while n > self.maxfiles: path, _ = self._cache.popitem(last=False) file_access = file_access_registry.get(path, None) if file_access is not None: file_access.close() n -= 1 def touch(self, filename): """ Add/move the touched file to the end of the `cache`. If adding a new file takes it over the limit of open files, another file will be closed. For use of the file cache, FileAccess should use `touch(filename)` every time it provides the underlying instance of `h5py.File` for reading. """ if filename in self._cache: self._cache.move_to_end(filename) else: self._cache[filename] = None self.close_old_files() def closed(self, filename): """Discard a closed file from the cache""" self._cache.pop(filename, None) def init_open_files_limiter(): # Raise the limit for open files (1024 -> 4096 on Maxwell) nofile = resource.getrlimit(resource.RLIMIT_NOFILE) resource.setrlimit(resource.RLIMIT_NOFILE, (nofile[1], nofile[1])) maxfiles = nofile[1] // 2 return OpenFilesLimiter(maxfiles) open_files_limiter = init_open_files_limiter() class MetaFileAccess(type): # Override regular instance creation to check in the registry first. # Defining __new__ on the class is not enough, because an object retrieved # from the registry that way will have its __init__ run again. def __call__(cls, filename, _cache_info=None): filename = osp.abspath(filename) inst = file_access_registry.get(filename, None) if inst is None: inst = file_access_registry[filename] = type.__call__( cls, filename, _cache_info ) return inst class FileAccess(metaclass=MetaFileAccess): """Access an EuXFEL HDF5 file. This does not necessarily keep the real file open, but opens it on demand. It assumes that the file is not changing on disk while this object exists. Parameters ---------- filename: str A path to an HDF5 file """ _file = None _format_version = None _path_infos = None _filename_infos = None metadata_fstat = None # Regular expressions to extract path and filename information for HDF5 # files saved on the EuXFEL computing infrastructure. euxfel_path_pattern = re.compile( # A path may have three different prefixes depending on the storage location. r'\/(gpfs\/exfel\/exp|gpfs\/exfel\/d|pnfs\/xfel.eu\/exfel\/archive\/XFEL)' # The first prefix above uses the pattern ////, # the other two use /// r'\/(\w+)\/(\d{6}|\w+)\/(\d{6}|p\d{6})\/(p\d{6}|[a-z]+)\/r\d{4}') euxfel_filename_pattern = re.compile(r'([A-Z]+)-R\d{4}-(\w+)-S(\d{5}).h5') def __init__(self, filename, _cache_info=None): self.filename = osp.abspath(filename) if _cache_info: self.train_ids = _cache_info['train_ids'] self.control_sources = _cache_info['control_sources'] self.instrument_sources = _cache_info['instrument_sources'] self.legacy_sources = _cache_info['legacy_sources'] self.validity_flag = _cache_info['flag'] else: try: tid_data = self.file['INDEX/trainId'][:] except KeyError: raise FileStructureError('INDEX/trainId dataset not found') self.train_ids = tid_data[tid_data != 0] self.control_sources, self.instrument_sources, self.legacy_sources \ = self._read_data_sources() self.validity_flag = None if self.validity_flag is None: if self.format_version == '0.5': self.validity_flag = self._guess_valid_trains() else: self.validity_flag = self.file['INDEX/flag'][:len(self.train_ids)].astype(bool) if self.format_version == '1.1': # File format version 1.1 changed the semantics of # INDEX/flag from a boolean flag to an index, with # the time server device being hardcoded to occur # at index 0. Inverting the flag after the boolean # cast above restores compatibility with format # version 1.0, with any "invalid" train having an # index >= 1, thus being casted to True and inverted # to False. Format 1.2 restored the 1.0 semantics. np.logical_not(self.validity_flag, out=self.validity_flag) warn( 'Train validation is not fully supported for data ' 'format version 1.1. If you have issues accessing ' 'these files, please contact da-support@xfel.eu.', stacklevel=2 ) if self._file is not None: # Store the stat of the file as it was when we read the metadata. # This is used by the run files map. self.metadata_fstat = os.stat(self.file.id.get_vfd_handle()) # {(file, source, group): (firsts, counts)} self._index_cache = {} # {source: set(keys)} self._keys_cache = {} self._run_keys_cache = {} # {source: set(keys)} - including incomplete sets self._known_keys = defaultdict(set) @property def file(self): open_files_limiter.touch(self.filename) # Local var to avoid a race condition when another thread calls .close() file = self._file if file is None: file = self._file = h5py.File(self.filename, 'r') return file def _evaluate_path_infos(self): m = self.euxfel_path_pattern.match(osp.dirname(osp.realpath(self.filename))) if m: if m[1] == 'gpfs/exfel/exp': self._path_infos = (m[5], m[2], m[3]) # ONC path. else: self._path_infos = (m[2], m[3], m[4]) # Maxwell path. else: self._path_infos = (None, None, None) def _evaluate_filename_infos(self): m = self.euxfel_filename_pattern.match(osp.basename(self.filename)) if m: self._filename_infos = (m[1], m[2], int(m[3])) else: self._filename_infos = (None, None, None) @property def storage_class(self): if self._path_infos is None: self._evaluate_path_infos() return self._path_infos[0] @property def instrument(self): if self._path_infos is None: self._evaluate_path_infos() return self._path_infos[1] @property def cycle(self): if self._path_infos is None: self._evaluate_path_infos() return self._path_infos[2] @property def data_category(self): if self._filename_infos is None: self._evaluate_filename_infos() return self._filename_infos[0] @property def aggregator(self): if self._filename_infos is None: self._evaluate_filename_infos() return self._filename_infos[1] @property def sequence(self): if self._filename_infos is None: self._evaluate_filename_infos() return self._filename_infos[2] @property def valid_train_ids(self): return self.train_ids[self.validity_flag] def has_train_ids(self, tids: np.ndarray, inc_suspect=False): f_tids = self.train_ids if inc_suspect else self.valid_train_ids return np.intersect1d(tids, f_tids).size > 0 def close(self): """Close* the HDF5 file this refers to. The file may not actually be closed if there are still references to objects from it, e.g. while iterating over trains. This is what HDF5 calls 'weak' closing. """ if self._file: self._file = None open_files_limiter.closed(self.filename) @property def format_version(self): if self._format_version is None: version_ds = self.file.get('METADATA/dataFormatVersion') if version_ds is not None: self._format_version = version_ds[0].decode('ascii') else: # The first version of the file format had no version number. # Numbering started at 1.0, so we call the first version 0.5. self._format_version = '0.5' return self._format_version def _read_data_sources(self): control_sources, instrument_sources, legacy_sources = set(), set(), dict() # The list of data sources moved in file format 1.0 if self.format_version == '0.5': data_sources_path = 'METADATA/dataSourceId' else: data_sources_path = 'METADATA/dataSources/dataSourceId' try: data_sources_group = self.file[data_sources_path] except KeyError: raise FileStructureError(f'{data_sources_path} not found') for source_id in data_sources_group[:]: if not source_id: continue source_id = source_id.decode() category, _, h5_source = source_id.partition('/') if category == 'INSTRUMENT': device, _, chan_grp = h5_source.partition(':') chan, _, group = chan_grp.partition('/') source = device + ':' + chan if source not in instrument_sources: # The legacy source name is only expected to be used # for instrument (more precisely, XTDF sources) for # now. For performance reasons, the check is # therefore only performed here, and only once rather # than by index group. item = self.file.get(f'{category}/{source}', getlink=True) if isinstance(item, h5py.SoftLink): legacy_sources[source] = item.path[1:].partition('/')[2] instrument_sources.add(source) # TODO: Do something with groups? elif category == 'CONTROL': control_sources.add(h5_source) elif category == 'Karabo_TimerServer': # Ignore virtual data source used only in file format # version 1.1 / pclayer-1.10.3-2.10.5. pass else: raise ValueError("Unknown data category %r" % category) return frozenset(control_sources), frozenset(instrument_sources), \ legacy_sources def _guess_valid_trains(self): # File format version 1.0 includes a flag which is 0 if a train ID # didn't come from the time server. We use this to skip bad trains, # especially for AGIPD. # Older files don't have this flag, so this tries to estimate validity. # The goal is to have a monotonic sequence within the file with the # fewest trains skipped. train_ids = self.train_ids flag = np.ones_like(train_ids, dtype=bool) for ix in np.nonzero(train_ids[1:] <= train_ids[:-1])[0]: # train_ids[ix] >= train_ids[ix + 1] invalid_before = train_ids[:ix+1] >= train_ids[ix+1] invalid_after = train_ids[ix+1:] <= train_ids[ix] # Which side of the downward jump in train IDs would need fewer # train IDs invalidated? if np.count_nonzero(invalid_before) < np.count_nonzero(invalid_after): flag[:ix+1] &= ~invalid_before else: flag[ix+1:] &= ~invalid_after return flag def __hash__(self): return hash(self.filename) def __eq__(self, other): return isinstance(other, FileAccess) and (other.filename == self.filename) def __repr__(self): return "{}({})".format(type(self).__name__, repr(self.filename)) # This is a staticmethod rather than a classmethod, because cloudpickle # (used in Dask, clusterfutures, etc.) tries to save bound classmethods # 'by value', which includes the registry, and that goes wrong. # The staticmethod is pickled 'by reference', avoiding that issue. @staticmethod def _from_pickle(filename, state): """Called when an instance is loaded from a pickle""" inst = file_access_registry.get(filename, None) if inst is None: inst = file_access_registry[filename] = FileAccess.__new__(FileAccess) inst.__dict__.update(state) return inst def __reduce__(self): """Called when a `FileAccess` instance is pickled""" state = self.__dict__.copy() state['_file'] = None return self._from_pickle, (self.filename, state) @property def all_sources(self): return self.control_sources | self.instrument_sources def get_index(self, source, group): """Get first index & count for a source and for a specific train ID. Indices are cached; this appears to provide some performance benefit. """ try: return self._index_cache[(source, group)] except KeyError: ix = self._read_index(source, group) self._index_cache[(source, group)] = ix return ix def _read_index(self, source, group): """Get first index & count for a source. This is 'real' reading when the requested index is not in the cache. """ ntrains = len(self.train_ids) ix_group = self.file['/INDEX/{}/{}'.format(source, group)] firsts = ix_group['first'][:ntrains] if 'count' in ix_group: counts = ix_group['count'][:ntrains] else: status = ix_group['status'][:ntrains] counts = np.uint64((ix_group['last'][:ntrains] - firsts + 1) * status) return firsts, counts def index_groups(self, source): """Get group names for index data, to use with .get_index()""" if source in self.control_sources: return {''} elif source in self.instrument_sources: return set(self.file[f'/INDEX/{source}'].keys()) else: raise SourceNameError(source) def metadata(self) -> dict: """Get the contents of the METADATA group as a dict Not including the lists of data sources """ if self.format_version == '0.5': # Pretend this is actually there, like format version 1.0 return {'dataFormatVersion': '0.5'} r = {} for k, ds in self.file['METADATA'].items(): if not isinstance(ds, h5py.Dataset): continue v = ds[0] if isinstance(v, bytes): v = v.decode('utf-8', 'surrogateescape') r[k] = v return r def get_keys(self, source): """Get keys for a given source name Keys are found by walking the HDF5 file, and cached for reuse. """ try: return self._keys_cache[source] except KeyError: pass if source in self.control_sources: group = '/CONTROL/' + source elif source in self.instrument_sources: group = '/INSTRUMENT/' + source else: raise SourceNameError(source) res = set() def add_key(key, value): if isinstance(value, h5py.Dataset): res.add(key.replace('/', '.')) self.file[group].visititems(add_key) self._keys_cache[source] = res return res def get_one_key(self, source, index_group=None): """Similar to get_keys(), except it returns only a single key for performance""" # Use empty prefix if no index group filter is active. prefix = index_group + '.' if index_group else '' if source in self._keys_cache: for key in self._keys_cache[source]: if key.startswith(prefix): return key # _keys_cache is a complete set, so this point can only be # reached for a key-less source (currently assumed to no # exist) or a non-existing index group was passed. raise ValueError(f'{index_group} not an index group of `{source}`') if self._known_keys[source]: for key in self._known_keys[source]: if key.startswith(prefix): return key if source in self.control_sources: root = 'CONTROL' elif source in self.instrument_sources: root = 'INSTRUMENT' else: raise SourceNameError(source) group = f'/{root}/{source}' if index_group: group += '/' + index_group def get_key(subkey, value): if isinstance(value, h5py.Dataset): return prefix + subkey.replace('/', '.') try: h5_group = self.file[group] except KeyError: # Can only happen for missing index groups, as missing # sources are handled above already. raise ValueError(f'{index_group} not an index group of `{source}`') else: return h5_group.visititems(get_key) def get_run_keys(self, source): """Get the keys in the RUN section for a given control source name Keys are found by walking the HDF5 file, and cached for reuse. """ try: return self._run_keys_cache[source] except KeyError: pass if source not in self.control_sources: raise SourceNameError(source) res = set() def add_key(key, value): if isinstance(value, h5py.Dataset): res.add(key.replace('/', '.')) self.file['/RUN/' + source].visititems(add_key) self._run_keys_cache[source] = res return res def has_source_key(self, source, key): """Check if the given source and key exist in this file This doesn't scan for all the keys in the source, as .get_keys() does. """ try: return key in self._keys_cache[source] except KeyError: pass if key in self._known_keys[source]: return True if source in self.control_sources: path = '/CONTROL/{}/{}'.format(source, key.replace('.', '/')) elif source in self.instrument_sources: path = '/INSTRUMENT/{}/{}'.format(source, key.replace('.', '/')) else: raise SourceNameError(source) # self.file.get(path, getclass=True) works, but is weirdly slow. # Checking like this is much faster. if (path in self.file) and isinstance( h5py.h5o.open(self.file.id, path.encode()), h5py.h5d.DatasetID ): self._known_keys[source].add(key) return True return False def dset_proxy(self, ds_path: str): return DatasetProxy(self, ds_path) class DatasetProxy: """A picklable reference to an HDF5 dataset, suitable for dask.array Dask tries to do this automatically for h5py Dataset objects, but with some limitations: - It only works with Dask distributed, not Dask's local schedulers. - Dask storing references to h5py Datasets keeps the files open, breaking our attempts to manage the number of open files. """ def __init__(self, file_acc: FileAccess, ds_path: str): # We could just store the file name and use h5py on demand, but storing # our FileAccess object lets it use our cache of open files. self.file_acc = file_acc self.ds_path = ds_path ds = file_acc.file[ds_path] # dask.array expects these three array-like attributes: self.shape = ds.shape self.ndim = ds.ndim self.dtype = ds.dtype def __getitem__(self, item): return self.file_acc.file[self.ds_path][item] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/keydata.py0000644000175100001660000005545314757376472017463 0ustar00runnerdockerfrom typing import List, Optional, Tuple import h5py import numpy as np from .exceptions import TrainIDError, NoDataError from .file_access import FileAccess from .read_machinery import ( contiguous_regions, DataChunk, select_train_ids, split_trains, roi_shape, trains_files_index, ) class KeyData: """Data for one key in one source Don't create this directly; get it from ``run[source, key]``. """ def __init__( self, source, key, *, train_ids, files, section, dtype, eshape, inc_suspect_trains=True, ): self.source = source self.key = key self.train_ids = train_ids self.files: List[FileAccess] = files self.section = section self.dtype = dtype self.entry_shape = eshape self.ndim = len(eshape) + 1 self.inc_suspect_trains = inc_suspect_trains def _find_chunks(self): """Find contiguous chunks of data for this key, in any order.""" all_tids_arr = np.array(self.train_ids) for file in self.files: if len(file.train_ids) == 0: continue firsts, counts = file.get_index(self.source, self.index_group) # Of trains in this file, which are in selection include = np.isin(file.train_ids, all_tids_arr) if not self.inc_suspect_trains: include &= file.validity_flag # Assemble contiguous chunks of data from this file for _from, _to in contiguous_regions(include): yield DataChunk( file, self.hdf5_data_path, first=firsts[_from], train_ids=file.train_ids[_from:_to], counts=counts[_from:_to], ) _cached_chunks = None @property def _data_chunks(self) -> List[DataChunk]: """An ordered list of chunks containing data""" if self._cached_chunks is None: self._cached_chunks = sorted( self._find_chunks(), key=lambda c: c.train_ids[0] ) return self._cached_chunks @property def _data_chunks_nonempty(self) -> List[DataChunk]: return [c for c in self._data_chunks if c.total_count] def __repr__(self): return f"" @property def is_control(self): """Whether this key belongs to a control source.""" return self.section == 'CONTROL' @property def is_instrument(self): """Whether this key belongs to an instrument source.""" return self.section == 'INSTRUMENT' @property def index_group(self): """The part of the key needed to look up index data""" if self.section == 'INSTRUMENT': return self.key.partition('.')[0] else: return '' @property def hdf5_data_path(self): """The path to the relevant dataset within each HDF5 file""" return f"/{self.section}/{self.source}/{self.key.replace('.', '/')}" @property def shape(self): """The shape of this data as a tuple, like for a NumPy array. Finding the shape may require getting index data from several files """ return (sum(c.total_count for c in self._data_chunks),) + self.entry_shape @property def nbytes(self): """The number of bytes this data would take up in memory.""" return self.dtype.itemsize * np.prod(self.shape) @property def size_mb(self): """The size of the data in memory in megabytes.""" return self.nbytes / 1e6 @property def size_gb(self): """The size of the data in memory in gigabytes.""" return self.nbytes / 1e9 @property def units(self): """The units symbol for this data, e.g. 'μJ', or None if not found""" attrs = self.attributes() base_unit = attrs.get('unitSymbol', None) if base_unit is None: return None prefix = attrs.get('metricPrefixSymbol', '') if prefix == 'u': prefix = 'μ' # We are not afraid of unicode return prefix + base_unit @property def units_name(self): """The units name for this data, e.g. 'microjoule', or None if not found""" attrs = self.attributes() base_unit = attrs.get('unitName', None) if base_unit is None: return None prefix = attrs.get('metricPrefixName', '') return prefix + base_unit @property def source_file_paths(self): paths = [] for chunk in self._data_chunks: if chunk.dataset.is_virtual: mappings = chunk.dataset.virtual_sources() for vspace, filename, _, _ in mappings: if filename in paths: continue # Already got it # Does the mapping overlap with this chunk of selected data? # We can assume that each mapping is a simple, contiguous # block, and only selection on the first dimension matters. starts, ends = vspace.get_select_bounds() map_start, map_stop = starts[0], ends[0] ck = chunk.slice if (map_stop > ck.start) and (map_start < ck.stop): paths.append(filename) # Include 1 source file even if no trains are selected if (not paths) and mappings: paths.append(mappings[0].file_name) else: paths.append(chunk.file.filename) # Fallback for virtual overview files where no data was recorded for # this source, so there's no mapping to point to. if not paths: source_grp = self.files[0].file[f"{self.section}/{self.source}"] if 'source_files' in source_grp.attrs: paths.append(source_grp.attrs['source_files'][0]) from pathlib import Path return [Path(p) for p in paths] def _find_attributes(self, dset): """Find Karabo attributes belonging to a dataset.""" attrs = dict(dset.attrs) if self.is_control and self.key.endswith('.value'): # For CONTROL sources, most of the attributes are saved on # the parent group rather than the .value dataset. In the # case of duplicated keys, the parent value appears to be # the correct one. attrs.update(dict(dset.parent.attrs)) return attrs def attributes(self): """Get a dict of all attributes stored with this data This may be awkward to use. See .units and .units_name for more convenient forms. """ dset = self.files[0].file[self.hdf5_data_path] attrs = self._find_attributes(dset) if (not attrs) and dset.is_virtual: # Virtual datasets were initially created without these attributes. # Find a source file. Not using source_file_paths as it can give []. _, filename, _, _ = dset.virtual_sources()[0] # Not using FileAccess: no need for train or source lists. with h5py.File(filename, 'r') as f: attrs = self._find_attributes(f[self.hdf5_data_path]) return attrs def select_trains(self, trains): """Select a subset of trains in this data as a new :class:`KeyData` object. Also available by slicing and indexing the KeyData object:: run[source, key][:10] # Select data for first 10 trains """ tids = select_train_ids(self.train_ids, trains) return self._only_tids(tids) def __getitem__(self, item): return self.select_trains(item) __iter__ = None # Disable iteration def _only_tids(self, tids, files=None): tids_arr = np.array(tids) if files is None: files = [ f for f in self.files if f.has_train_ids(tids_arr, self.inc_suspect_trains) ] if not files: # Keep 1 file, even if 0 trains selected. files = [self.files[0]] return KeyData( self.source, self.key, train_ids=tids, files=files, section=self.section, dtype=self.dtype, eshape=self.entry_shape, inc_suspect_trains=self.inc_suspect_trains, ) def drop_empty_trains(self): """Select only trains with data as a new :class:`KeyData` object.""" counts = self.data_counts(labelled=False) tids = np.array(self.train_ids)[counts > 0] return self._only_tids(list(tids)) def split_trains(self, parts=None, trains_per_part=None): """Split this data into chunks with a fraction of the trains each. Either *parts* or *trains_per_part* must be specified. This returns an iterator yielding new :class:`KeyData` objects. The parts will have similar sizes, e.g. splitting 11 trains with ``trains_per_part=8`` will produce 5 & 6 trains, not 8 & 3. Selected trains count even if they are missing data, so different keys from the same run can be split into matching chunks. Parameters ---------- parts: int How many parts to split the data into. If trains_per_part is also specified, this is a minimum, and it may make more parts. It may also make fewer if there are fewer trains in the data. trains_per_part: int A maximum number of trains in each part. Parts will often have fewer trains than this. """ # tids_files points to the file for each train. # This avoids checking all files for each chunk, which can be slow. tids_files = trains_files_index( self.train_ids, self.files, self.inc_suspect_trains ) for sl in split_trains(len(self.train_ids), parts, trains_per_part): tids = self.train_ids[sl] files = set(tids_files[sl]) - {None} files = sorted(files, key=lambda f: f.filename) yield self._only_tids(tids, files=files) def data_counts(self, labelled=True): """Get a count of data entries in each train. If *labelled* is True, returns a pandas series with an index of train IDs. Otherwise, returns a NumPy array of counts to match ``.train_ids``. """ if self._data_chunks: train_ids = np.concatenate([c.train_ids for c in self._data_chunks]) counts = np.concatenate([c.counts for c in self._data_chunks]) else: train_ids = counts = np.zeros(0, dtype=np.uint64) if labelled: import pandas as pd return pd.Series(counts, index=train_ids) else: all_tids_arr = np.array(self.train_ids) res = np.zeros(len(all_tids_arr), dtype=np.uint64) tid_to_ix = np.intersect1d(all_tids_arr, train_ids, return_indices=True)[1] # We may be missing some train IDs, if they're not in any file # for this source, and they're sometimes out of order within chunks # (they shouldn't be, but we try not to fail too badly if they are). assert len(tid_to_ix) == len(train_ids) res[tid_to_ix] = counts return res def as_single_value(self, rtol=1e-5, atol=0.0, reduce_by='median'): """Retrieve a single reduced value if within tolerances. The relative and absolute tolerances *rtol* and *atol* work the same way as in ``numpy.allclose``. The default relative tolerance is 1e-5 with no absolute tolerance. The data for this key is compared against a reduced value obtained by the method described in *reduce_by*. This may be a callable taking the key data, the string value of a global symbol in the numpy packge such as 'median' or 'first' to use the first value encountered. By default, 'median' is used. If within tolerances, the reduced value is returned. """ data = self.ndarray() if len(data) == 0: raise NoDataError(self.source, self.key) if callable(reduce_by): value = reduce_by(data) elif isinstance(reduce_by, str) and hasattr(np, reduce_by): value = getattr(np, reduce_by)(data, axis=0) elif reduce_by == 'first': value = data[0] else: raise ValueError('invalid reduction method (may be callable, ' 'global numpy symbol or "first")') if not np.allclose(data, value, rtol=rtol, atol=atol): adev = np.max(np.abs(data - value)) rdev = np.max(np.abs(adev / value)) raise ValueError(f'data values are not within tolerance ' f'(absolute: {adev:.3g}, relative: {rdev:.3g})') return value # Getting data as different kinds of array: ------------------------------- def ndarray(self, roi=(), out=None): """Load this data as a numpy array *roi* may be a ``numpy.s_[]`` expression to load e.g. only part of each image from a camera. If *out* is not given, a suitable array will be allocated. """ if not isinstance(roi, tuple): roi = (roi,) req_shape = self.shape[:1] + roi_shape(self.entry_shape, roi) if out is None: out = np.empty(req_shape, dtype=self.dtype) elif out is not None and out.shape != req_shape: raise ValueError(f'requires output array of shape {req_shape}') # Read the data from each chunk into the result array dest_cursor = 0 for chunk in self._data_chunks_nonempty: dest_chunk_end = dest_cursor + chunk.total_count slices = (chunk.slice,) + roi chunk.dataset.read_direct( out[dest_cursor:dest_chunk_end], source_sel=slices ) dest_cursor = dest_chunk_end return out def train_id_coordinates(self): """Make an array of train IDs to use alongside data from ``.ndarray()``. :attr:`train_ids` includes each selected train ID once, including trains where data is missing. :meth:`train_id_coordinates` excludes missing trains, and repeats train IDs if the source has multiple entries per train. The result will be the same length as the first dimension of an array from :meth:`ndarray`, and tells you which train each entry belongs to. .. seealso:: :meth:`xarray` returns a labelled array including these train IDs. """ if not self._data_chunks: return np.zeros(0, dtype=np.uint64) chunks_trainids = [ np.repeat(chunk.train_ids, chunk.counts.astype(np.intp)) for chunk in self._data_chunks ] return np.concatenate(chunks_trainids) def xarray(self, extra_dims=None, roi=(), name=None): """Load this data as a labelled xarray array or dataset. The first dimension is labelled with train IDs. Other dimensions may be named by passing a list of names to *extra_dims*. For scalar datatypes, an xarray.DataArray is returned using either the supplied *name* or the concatenated source and key name if omitted. If the data is stored in a structured datatype, an xarray.Dataset is returned with a variable for each field. The data of these variables will be non-contiguous in memory, use `Dataset.copy(deep=true)` to obtain a contiguous copy. Parameters ---------- extra_dims: list of str Name extra dimensions in the array. The first dimension is automatically called 'train'. The default for extra dimensions is dim_0, dim_1, ... roi: numpy.s_[], slice, tuple of slices, or by_index The region of interest. This expression selects data in all dimensions apart from the first (trains) dimension. If the data holds a 1D array for each entry, roi=np.s_[:8] would get the first 8 values from every train. If the data is 2D or more at each entry, selection looks like roi=np.s_[:8, 5:10] . name: str Name the array itself. The default is the source and key joined by a dot. Ignored for structured data when a dataset is returned. """ import xarray ndarr = self.ndarray(roi=roi) # Dimension labels if extra_dims is None: extra_dims = ['dim_%d' % i for i in range(ndarr.ndim - 1)] dims = ['trainId'] + extra_dims # Train ID index coords = {'trainId': self.train_id_coordinates()} # xarray attributes attrs = {} if (units := self.units): attrs['units'] = units if ndarr.dtype.names is not None: # Structured dtype. return xarray.Dataset( {field: (dims, ndarr[field]) for field in ndarr.dtype.names}, coords=coords, attrs=attrs) else: if name is None: name = f'{self.source}.{self.key}' if name.endswith('.value') and self.section == 'CONTROL': name = name[:-6] # Primitive dtype. return xarray.DataArray( ndarr, dims=dims, coords=coords, name=name, attrs=attrs) def series(self): """Load this data as a pandas Series. Only for 1D data. """ import pandas as pd if self.ndim > 1: raise TypeError("pandas Series are only available for 1D data") name = self.source + '/' + self.key if name.endswith('.value') and self.section == 'CONTROL': name = name[:-6] index = pd.Index(self.train_id_coordinates(), name='trainId') data = self.ndarray() return pd.Series(data, name=name, index=index) def dask_array(self, labelled=False): """Make a Dask array for this data. Dask is a system for lazy parallel computation. This method doesn't actually load the data, but gives you an array-like object which you can operate on. Dask loads the data and calculates results when you ask it to, e.g. by calling a ``.compute()`` method. See the Dask documentation for more details. If your computation depends on reading lots of data, consider creating a dask.distributed.Client before calling this. If you don't do this, Dask uses threads by default, which is not efficient for reading HDF5 files. Parameters ---------- labelled: bool If True, label the train IDs for the data, returning an xarray.DataArray object wrapping a Dask array. """ import dask.array as da chunks_darrs = [] for chunk in self._data_chunks_nonempty: chunk_dim0 = chunk.total_count chunk_shape = (chunk_dim0,) + chunk.dataset.shape[1:] itemsize = chunk.dataset.dtype.itemsize # Find chunk size of maximum 2 GB. This is largely arbitrary: # we want chunks small enough that each worker can have at least # a couple in memory (Maxwell nodes have 256-768 GB in late 2019). # But bigger chunks means less overhead. # Empirically, making chunks 4 times bigger/smaller didn't seem to # affect speed dramatically - but this could depend on many factors. # TODO: optional user control of chunking limit = 2 * 1024 ** 3 while np.prod(chunk_shape) * itemsize > limit and chunk_dim0 > 1: chunk_dim0 //= 2 chunk_shape = (chunk_dim0,) + chunk.dataset.shape[1:] chunks_darrs.append( da.from_array( chunk.file.dset_proxy(chunk.dataset_path), chunks=chunk_shape )[chunk.slice] ) if chunks_darrs: dask_arr = da.concatenate(chunks_darrs, axis=0) else: shape = (0,) + self.entry_shape dask_arr = da.zeros(shape=shape, dtype=self.dtype, chunks=shape) if labelled: # Dimension labels dims = ['trainId'] + ['dim_%d' % i for i in range(dask_arr.ndim - 1)] # Train ID index coords = {'trainId': self.train_id_coordinates()} import xarray return xarray.DataArray(dask_arr, dims=dims, coords=coords) else: return dask_arr # Getting data by train: -------------------------------------------------- def _find_tid(self, tid) -> Tuple[Optional[FileAccess], int]: for fa in self.files: matches = (fa.train_ids == tid).nonzero()[0] if self.inc_suspect_trains and matches.size > 0: return fa, matches[0] for ix in matches: if fa.validity_flag[ix]: return fa, ix return None, 0 def train_from_id(self, tid, keep_dims=False): """Get data for the given train ID as a numpy array. Returns (train ID, array) """ if tid not in self.train_ids: raise TrainIDError(tid) fa, ix = self._find_tid(tid) if fa is None: return np.empty((0,) + self.entry_shape, dtype=self.dtype) firsts, counts = fa.get_index(self.source, self.index_group) first, count = firsts[ix], counts[ix] if count == 1 and not keep_dims: return tid, fa.file[self.hdf5_data_path][first] else: return tid, fa.file[self.hdf5_data_path][first: first+count] def train_from_index(self, i, keep_dims=False): """Get data for a train by index (starting at 0) as a numpy array. Returns (train ID, array) """ return self.train_from_id(self.train_ids[i], keep_dims=keep_dims) def trains(self, keep_dims=False, include_empty=False): """Iterate through trains containing data for this key Yields pairs of (train ID, array). Train axis is removed in case of single elements unless *keep_dims* is set. Skips trains where data is missing unless *include_empty* is set, returning None or zero-length array with *keep_dims*. """ if keep_dims and include_empty: empty_result = np.zeros(shape=(0,) + self.entry_shape, dtype=self.dtype) else: empty_result = None for chunk in self._data_chunks_nonempty: start = chunk.first ds = chunk.dataset for tid, count in zip(chunk.train_ids, chunk.counts): if count > 1 or keep_dims: yield tid, ds[start: start+count] elif count == 1: yield tid, ds[start] elif include_empty: yield tid, empty_result start += count ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/locality.py0000644000175100001660000000766514757376472017663 0ustar00runnerdocker""" Tools to check a file locality at EuXFEL May be used to avoiding hangs on reading files from dCache if they are not available or stored only on tape """ import os import sys from collections import defaultdict import multiprocessing as mp UNAVAIL = 1 ONTAPE = 2 ONDISK = 4 ANY = 7 DC_LOC_RESP = { 'UNAVAILABLE': UNAVAIL, 'NEARLINE': ONTAPE, 'ONLINE': ONDISK, 'ONLINE_AND_NEARLINE': ONTAPE | ONDISK, 'NOT_ON_DCACHE': ONDISK, } LOCMSG = { 0: 'Unknown locality', 1: 'Unavailable', 2: 'Only on tape', 4: 'On disk', 6: 'On disk', } def get_locality(path): """ Returns locality of the file (path) """ basedir, filename = os.path.split(path) dotcmd = os.path.join(basedir, '.(get)({})(locality)'.format(filename)) try: with open(dotcmd, 'r') as f: return path, f.read().strip() except FileNotFoundError: return path, 'NOT_ON_DCACHE' def list_locality(files): """ Returns locality of the list of files """ with mp.Pool() as p: yield from p.imap_unordered(get_locality, files) def print_counts(fpart): """ Prints the counters of different localities """ n_ondisk = len(fpart['NOT_ON_DCACHE']) + len(fpart['ONLINE_AND_NEARLINE']) + len(fpart['ONLINE']) n_ontape = len(fpart['NEARLINE']) n_unavail = len(fpart['UNAVAILABLE']) print(f"{n_ondisk} on disk, {n_ontape} only on tape, {n_unavail} unavailable ", end='\r') def silent(fpart): """ Prints nothing """ pass def partition(files, cb_disp=silent): """ Partition files by locality """ fpart = defaultdict(list) for path, loc in list_locality(files): fpart[loc].append(path) cb_disp(fpart) return fpart def lc_match(files, accept=ONDISK): """ Returns files which has accepted locality """ filtered = [] for path, loc in list_locality(files): code = DC_LOC_RESP.get(loc, 0) if code & accept: filtered.append(path) else: print(f"Skipping file {path}", file=sys.stderr) print(f" ({LOCMSG[code]})", file=sys.stderr) return filtered def lc_any(files): """ Returns all files, does nothing """ return files def lc_ondisk(files): """Returns files on disk, excluding any which would be read from tape""" return lc_match(files, ONDISK) def lc_avail(files): """Returns files which are available on disk or tape Excludes files which dCache reports are unavailable. """ return lc_match(files, ONTAPE | ONDISK) def check_dir(basedir): """ Check basedir and prints results """ if os.path.isdir(basedir): ls = ( os.path.join(basedir, f) for f in os.listdir(basedir) ) files = [ f for f in ls if os.path.isfile(f) ] elif os.path.isfile(basedir): files = [ basedir ] else: files = [] print(f"Checking {len(files)} files in {basedir}") fp = partition(files, print_counts) print("") retcode = 0 if fp['NEARLINE']: retcode |= 1 print("Only on tape:") for file in sorted(fp['NEARLINE']): print(f" {file}") if fp['UNAVAILABLE']: retcode |= 2 print("Unavailable:") for file in sorted(fp['UNAVAILABLE']): print(f" {file}") unknown_locality = set(fp) - set(DC_LOC_RESP) if unknown_locality: retcode |= 4 print("Unknown locality:", unknown_locality) return retcode from argparse import ArgumentParser def main(argv=None): if argv is None: argv = sys.argv[1:] ap = ArgumentParser(prog='extra-data-locality', description="Checks locality of files in the directory") ap.add_argument('path', help="run directory of HDF5 files.") args = ap.parse_args(argv) if not os.path.exists(args.path): print(f"Path '{args.path}' is not found") return 255 return check_dir(args.path) if __name__ == "__main__": sys.exit(main()) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/lsxfel.py0000644000175100001660000001313414757376472017324 0ustar00runnerdocker"""Summarise XFEL data in files or folders """ import argparse from collections import defaultdict import os import os.path as osp import re import sys from .read_machinery import FilenameInfo from .reader import H5File, RunDirectory def describe_file(path, details_for_sources=()): """Describe a single HDF5 data file""" basename = os.path.basename(path) print(basename, ": Data file") h5file = H5File(path) h5file.info(details_for_sources) def summarise_file(path): basename = os.path.basename(path) print(basename, ": Data file") f = H5File(path) print(f" {len(f.train_ids)} trains, {len(f.all_sources)} sources") def describe_run(path, details_for_sources=()): basename = os.path.basename(path) print(basename, ": Run directory") print() run = RunDirectory(path) run.info(details_for_sources) def summarise_run(path, indent=''): basename = os.path.basename(path) # Accessing all the files in a run can be slow. To get the number of trains, # pick one set of segments (time slices of data from the same source). # This relies on each set of segments recording the same number of trains. segment_sequences = defaultdict(list) n_detector = n_other = 0 for f in sorted(os.listdir(path)): m = re.match(r'(.+)-S\d+\.h5', osp.basename(f)) if m: segment_sequences[m.group(1)].append(f) if FilenameInfo(f).is_detector: n_detector += 1 else: n_other += 1 if len(segment_sequences) < 1: raise ValueError("No data files recognised in %s" % path) # Take the shortest group of segments to make reading quicker first_group = sorted(segment_sequences.values(), key=len)[0] train_ids = set() for f in first_group: train_ids.update(H5File(osp.join(path, f)).train_ids) print("{}{} : Run of {:>4} trains, with {:>3} detector files and {:>3} others".format( indent, basename, len(train_ids), n_detector, n_other )) def main(argv=None): ap = argparse.ArgumentParser( prog='lsxfel', description="Summarise XFEL data in files or folders" ) ap.add_argument('paths', nargs='*', help="Files/folders to look at") ap.add_argument('--detail', action='append', default=[], help="Show details on keys & data for specified sources. " "This can slow down lsxfel considerably. " "Wildcard patterns like '*/XGM/*' are allowed, though you may " "need single quotes to prevent the shell processing them. " "Can be used more than once to include several patterns. " "Only used when inspecting a single run or file." ) args = ap.parse_args(argv) paths = args.paths or [os.path.abspath(os.getcwd())] if len(paths) == 1: path = paths[0] basename = os.path.basename(os.path.abspath(path.rstrip('/'))) if os.path.isdir(path): contents = sorted(os.listdir(path)) if any(f.endswith('.h5') for f in contents): # Run directory describe_run(path, args.detail) elif any(re.match(r'r\d+', f) for f in contents): # Proposal directory, containing runs print(basename, ": Proposal data directory") print() for f in contents: child_path = os.path.join(path, f) if re.match(r'r\d+', f) and os.path.isdir(child_path): summarise_run(child_path, indent=' ') elif osp.isdir(osp.join(path, 'raw')): print(basename, ": Proposal directory") print() print('{}/raw/'.format(basename)) for f in sorted(os.listdir(osp.join(path, 'raw'))): child_path = os.path.join(path, 'raw', f) if re.match(r'r\d+', f) and os.path.isdir(child_path): summarise_run(child_path, indent=' ') else: print(basename, ": Unrecognised directory") elif os.path.isfile(path): if path.endswith('.h5'): describe_file(path, args.detail) else: print(basename, ": Unrecognised file") return 2 else: print(path, ': File/folder not found') return 2 else: exit_code = 0 for path in paths: basename = os.path.basename(path) if os.path.isdir(path): contents = os.listdir(path) if any(f.endswith('.h5') for f in contents): # Run directory summarise_run(path) elif any(re.match(r'r\d+', f) for f in contents): # Proposal directory, containing runs print(basename, ": Proposal directory") print() for f in contents: child_path = os.path.join(path, f) if re.match(r'r\d+', f) and os.path.isdir(child_path): summarise_run(child_path, indent=' ') else: print(basename, ": Unrecognised directory") exit_code = 2 elif os.path.isfile(path): if path.endswith('.h5'): summarise_file(path) else: print(basename, ": Unrecognised file") exit_code = 2 else: print(path, ': File/folder not found') exit_code = 2 return exit_code if __name__ == '__main__': sys.exit(main()) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/read_machinery.py0000644000175100001660000002232114757376472020777 0ustar00runnerdocker"""Machinery for reading Karabo HDF5 files The public API is in extra_data.reader; this is internal code. """ import logging import math import operator import os import os.path as osp import re import time from glob import iglob from numbers import Integral from warnings import warn import numpy as np log = logging.getLogger(__name__) DETECTOR_NAMES = {'AGIPD', 'DSSC', 'LPD'} DETECTOR_SOURCE_RE = re.compile(r'(.+\/(?:DET|CORR))\/(\d+)CH') DATA_ROOT_DIR = os.environ.get('EXTRA_DATA_DATA_ROOT', '/gpfs/exfel/exp') class _SliceConstructor(type): """Allows instantiation like subclass[1:5] """ def __getitem__(self, item): return self(item) class _SliceConstructable(metaclass=_SliceConstructor): def __init__(self, value): self.value = value def __repr__(self): indices = self.value if not isinstance(indices, tuple): indices = (indices,) return "{}[{}]".format( type(self).__name__, ', '.join(self._indexing_repr(v) for v in indices) ) @staticmethod def _indexing_repr(value): """Represent values as used in canonical slicing syntax""" if value is Ellipsis: return '...' elif isinstance(value, slice): start = value.start if (value.start is not None) else '' stop = value.stop if (value.stop is not None) else '' step = ':{}'.format(value.step) if (value.step is not None) else '' return '{}:{}{}'.format(start, stop, step) return repr(value) class by_id(_SliceConstructable): pass class by_index(_SliceConstructable): pass def _tid_to_slice_ix(tid, train_ids, stop=False): """Convert a train ID to an integer index for slicing the dataset Throws ValueError if the slice won't overlap the trains in the data. The *stop* parameter tells it which end of the slice it is making. """ if tid is None: return None try: return train_ids.index(tid) except ValueError: pass if len(train_ids) == 0: warn("Using train ID slice on data with no trains selected", stacklevel=4) return 0 if tid < train_ids[0]: if stop: warn( f"Train ID {tid} is before this run (starts at {train_ids[0]})", stacklevel=4, ) return 0 else: return None elif tid > train_ids[-1]: if stop: return None else: warn( f"Train ID {tid} is after this run (ends at {train_ids[-1]})", stacklevel=4, ) return len(train_ids) else: # This train ID is within the run, but doesn't have an entry. # Find the first ID in the run greater than the one given. return (train_ids > tid).nonzero()[0][0] def is_int_like(x): if isinstance(x, np.ndarray): return x.ndim == 0 and np.issubdtype(x.dtype, np.integer) return isinstance(x, Integral) def select_train_ids(train_ids, sel): if isinstance(sel, by_index): sel = sel.value if isinstance(sel, by_id): if isinstance(sel.value, slice): # Slice by train IDs start_ix = _tid_to_slice_ix(sel.value.start, train_ids, stop=False) stop_ix = _tid_to_slice_ix(sel.value.stop, train_ids, stop=True) return train_ids[start_ix: stop_ix: sel.value.step] if is_int_like(sel.value): sel.value = [operator.index(sel.value)] if isinstance(sel.value, (list, np.ndarray)): # Select a list of trains by train ID new_train_ids = sorted(set(train_ids).intersection(sel.value)) if len(sel.value) and not new_train_ids: warn(f"Given train IDs not found among {len(train_ids)} trains" " in collection", stacklevel=3,) return new_train_ids else: raise TypeError(type(sel.value)) elif isinstance(sel, slice): # Slice by indexes in this collection return train_ids[sel] elif is_int_like(sel): return [train_ids[operator.index(sel)]] elif isinstance(sel, (list, np.ndarray)): # Select a list of trains by index in this collection return sorted(np.asarray(train_ids)[sel]) else: raise TypeError(type(sel)) def split_trains(n_trains, parts=None, trains_per_part=None) -> [slice]: if trains_per_part is not None: assert trains_per_part >= 1 n_parts = math.ceil(n_trains / trains_per_part) if parts is not None: n_parts = max(n_parts, min(parts, n_trains)) elif parts is not None: assert parts >= 1 n_parts = min(parts, n_trains) else: raise ValueError("Either parts or trains_per_part must be specified") return [ slice(i * n_trains // n_parts, (i + 1) * n_trains // n_parts) for i in range(n_parts) ] def trains_files_index(train_ids, files, inc_suspect_trains=True) -> list: """Make a list of which FileAccess contains each train, used in splitting""" tids_files = [None] * len(train_ids) tid_to_ix = {t: i for i, t in enumerate(train_ids)} for file in files: f_tids = file.train_ids if inc_suspect_trains else file.valid_train_ids for tid in f_tids: ix = tid_to_ix.get(tid, None) if ix is not None: tids_files[ix] = file return tids_files class DataChunk: """Reference to a contiguous chunk of data for one or more trains.""" def __init__(self, file, dataset_path, first, train_ids, counts): self.file = file self.dataset_path = dataset_path self.first = first self.train_ids = train_ids self.counts = counts @property def slice(self): return slice(self.first, self.first + np.sum(self.counts)) @property def total_count(self): return int(np.sum(self.counts, dtype=np.uint64)) @property def dataset(self): return self.file.file[self.dataset_path] # contiguous_regions() by Joe Kington on Stackoverflow # https://stackoverflow.com/a/4495197/434217 # Used here under Stackoverflow's default CC-BY-SA 3.0 license. def contiguous_regions(condition): """Finds contiguous True regions of the boolean array "condition". Returns a 2D array where the first column is the start index of the region and the second column is the end index.""" # Find the indices of changes in "condition" d = np.diff(condition) idx, = d.nonzero() # We need to start things after the change in "condition". Therefore, # we'll shift the index by 1 to the right. idx += 1 if condition[0]: # If the start of condition is True prepend a 0 idx = np.r_[0, idx] if condition[-1]: # If the end of condition is True, append the length of the array idx = np.r_[idx, condition.size] # Edit # Reshape the result into two columns idx.shape = (-1,2) return idx def roi_shape(orig_shape: tuple, roi: tuple) -> tuple: """Find array shape after slicing ROI""" dummy = np.zeros((0,) + orig_shape) # Extra 0 dim -> minimal memory use return dummy[np.index_exp[:] + roi].shape[1:] class FilenameInfo: is_detector = False detector_name = None detector_moduleno = -1 _rawcorr_descr = {'RAW': 'Raw', 'CORR': 'Corrected'} def __init__(self, path): self.basename = osp.basename(path) nameparts = self.basename[:-3].split('-') assert len(nameparts) == 4, self.basename rawcorr, runno, datasrc, segment = nameparts m = re.match(r'([A-Z]+)(\d+)', datasrc) if m and m.group(1) == 'DA': self.description = "Aggregated data" elif m and m.group(1) in DETECTOR_NAMES: self.is_detector = True name, moduleno = m.groups() self.detector_name = name self.detector_moduleno = moduleno self.description = "{} detector data from {} module {}".format( self._rawcorr_descr.get(rawcorr, '?'), name, moduleno ) else: self.description = "Unknown data source ({})", datasrc def find_proposal(propno): """Find the proposal directory for a given proposal on Maxwell""" if '/' in propno: # Already passed a proposal directory return propno t0 = time.monotonic() for d in iglob(osp.join(DATA_ROOT_DIR, '*/*/{}'.format(propno))): dt = time.monotonic() - t0 log.info("Found proposal dir %r in %.2g s", d, dt) return d raise Exception("Couldn't find proposal dir for {!r}".format(propno)) def same_run(*args) -> bool: """return True if arguments objects contain data from the same RUN arguments can be of type *DataCollection* or *SourceData* """ # DataCollection union of format version = 0.5 (no run/proposal # in # files) is not considered a single run. proposal_nos = set() run_nos = set() for dc in args: md = dc.run_metadata() if dc.is_single_run else {} proposal_nos.add(md.get("proposalNumber", -1)) run_nos.add(md.get("runNumber", -1)) return (len(proposal_nos) == 1 and (-1 not in proposal_nos) and len(run_nos) == 1 and (-1 not in run_nos)) glob_wildcards_re = re.compile(r'([*?[])') ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/reader.py0000644000175100001660000023613314757376472017277 0ustar00runnerdocker# coding: utf-8 """ Collection of classes and functions to help reading HDF5 file generated at The European XFEL. Copyright (c) 2017, European X-Ray Free-Electron Laser Facility GmbH All rights reserved. You should have received a copy of the 3-Clause BSD License along with this program. If not, see """ import datetime import fnmatch import logging import os import os.path as osp import re import signal import sys import tempfile import time from collections import defaultdict from collections.abc import Iterable, Mapping, Sequence from itertools import groupby from multiprocessing import Pool from operator import index from pathlib import Path from typing import Tuple from warnings import warn import h5py import numpy as np from . import locality, voview from .aliases import AliasIndexer from .exceptions import (MultiRunError, PropertyNameError, SourceNameError, TrainIDError) from .file_access import FileAccess from .keydata import KeyData from .read_machinery import (DETECTOR_SOURCE_RE, by_id, by_index, find_proposal, glob_wildcards_re, is_int_like, same_run, select_train_ids) from .run_files_map import RunFilesMap from .sourcedata import SourceData from .utils import available_cpu_cores __all__ = [ 'H5File', 'RunDirectory', 'open_run', 'FileAccess', 'DataCollection', 'by_id', 'by_index', 'SourceNameError', 'PropertyNameError', ] log = logging.getLogger(__name__) RUN_DATA = 'RUN' INDEX_DATA = 'INDEX' METADATA = 'METADATA' def ignore_sigint(): # Used in child processes to prevent them from receiving KeyboardInterrupt signal.signal(signal.SIGINT, signal.SIG_IGN) class DataCollection: """An assemblage of data generated at European XFEL Data consists of *sources* which each have *keys*. It is further organised by *trains*, which are identified by train IDs. You normally get an instance of this class by calling :func:`H5File` for a single file or :func:`RunDirectory` for a directory. """ def __init__( self, files, sources_data=None, train_ids=None, aliases=None, ctx_closes=False, *, inc_suspect_trains=True, is_single_run=False, ): self.files = list(files) self.ctx_closes = ctx_closes self.inc_suspect_trains = inc_suspect_trains self.is_single_run = is_single_run if train_ids is None: if inc_suspect_trains: tid_sets = [f.train_ids for f in files] else: tid_sets = [f.valid_train_ids for f in files] train_ids = sorted(set().union(*tid_sets)) self.train_ids = train_ids if sources_data is None: files_by_sources = defaultdict(list) legacy_sources = dict() for f in self.files: for source in f.control_sources: files_by_sources[source, 'CONTROL'].append(f) for source in f.instrument_sources: files_by_sources[source, 'INSTRUMENT'].append(f) legacy_sources.update(f.legacy_sources) sources_data = { src: SourceData(src, sel_keys=None, train_ids=train_ids, files=files, section=section, canonical_name=legacy_sources.get(src, src), is_single_run=self.is_single_run, inc_suspect_trains=self.inc_suspect_trains ) for ((src, section), files) in files_by_sources.items() } self._sources_data = sources_data self._aliases = aliases or {} # Throw an error if we have conflicting data for the same source self._check_source_conflicts() self.control_sources = frozenset({ name for (name, sd) in self._sources_data.items() if sd.section == 'CONTROL' }) self.instrument_sources = frozenset({ name for (name, sd) in self._sources_data.items() if sd.section == 'INSTRUMENT' }) self.legacy_sources = { name: sd.canonical_name for (name, sd) in self._sources_data.items() if sd.is_legacy } @staticmethod def _open_file(path, cache_info=None): try: fa = FileAccess(path, _cache_info=cache_info) except Exception as e: return osp.basename(path), str(e) else: return osp.basename(path), fa @classmethod def from_paths( cls, paths, _files_map=None, *, inc_suspect_trains=True, is_single_run=False, parallelize=True ): files = [] uncached = [] def handle_open_file_attempt(fname, fa): if isinstance(fa, FileAccess): files.append(fa) else: print(f"Skipping file {fname}", file=sys.stderr) print(f" (error was: {fa})", file=sys.stderr) for path in paths: cache_info = _files_map and _files_map.get(path) if cache_info and ('flag' in cache_info): filename, fa = cls._open_file(path, cache_info=cache_info) handle_open_file_attempt(filename, fa) else: uncached.append(path) if uncached: # Open the files either in parallel or serially if parallelize: nproc = min(available_cpu_cores(), len(uncached)) with Pool(processes=nproc, initializer=ignore_sigint) as pool: for fname, fa in pool.imap_unordered(cls._open_file, uncached): handle_open_file_attempt(fname, fa) else: for path in uncached: handle_open_file_attempt(*cls._open_file(path)) if not files: raise Exception("All HDF5 files specified are unusable") return cls( files, ctx_closes=True, inc_suspect_trains=inc_suspect_trains, is_single_run=is_single_run, ) @classmethod def from_path(cls, path, *, inc_suspect_trains=True): files = [FileAccess(path)] return cls( files, ctx_closes=True, inc_suspect_trains=inc_suspect_trains, is_single_run=True ) def __enter__(self): if not self.ctx_closes: raise Exception( "Only DataCollection objects created by opening " "files directly can be used in a 'with' statement, " "not those created by selecting from or merging " "others." ) return self def __exit__(self, exc_type, exc_val, exc_tb): # Close the files if this collection was created by opening them. if self.ctx_closes: for file in self.files: file.close() @property def selection(self): # This was previously a regular attribute, which code may have relied on. return {src: srcdata.sel_keys for src, srcdata in self._sources_data.items()} @property def _source_index(self): warn( "DataCollection._source_index will be removed. " "Contact da-support@xfel.eu if you need to discuss alternatives.", stacklevel=2 ) return {src: srcdata.files for src, srcdata in self._sources_data.items()} @property def all_sources(self): return self.control_sources | self.instrument_sources @property def detector_sources(self): return set(filter(DETECTOR_SOURCE_RE.match, self.instrument_sources)) \ - self.legacy_sources.keys() def _check_field(self, source, key): if source not in self.all_sources: raise SourceNameError(source) if key not in self[source]: raise PropertyNameError(key, source) def keys_for_source(self, source): """Get a set of key names for the given source If you have used :meth:`select` to filter keys, only selected keys are returned. Only one file is used to find the keys. Within a run, all files should have the same keys for a given source, but if you use :meth:`union` to combine two runs where the source was configured differently, the result can be unpredictable. """ return self._get_source_data(source).keys() # Leave old name in case anything external was using it: _keys_for_source = keys_for_source def _get_key_data(self, source, key): return self._get_source_data(source)[key] def _get_source_data(self, source): if source not in self._sources_data: raise SourceNameError(source) sd = self._sources_data[source] if sd.is_legacy: warn(f"{source} is a legacy name for {self.legacy_sources[source]}. " f"Access via this name will be removed for future data.", DeprecationWarning, stacklevel=3) return self._sources_data[source] def __getitem__(self, item): if ( isinstance(item, tuple) and len(item) == 2 and all(isinstance(e, str) for e in item) ): return self._get_key_data(*item) elif isinstance(item, str): return self._get_source_data(item) elif ( isinstance(item, (by_id, by_index, list, np.ndarray, slice)) or is_int_like(item) ): return self.select_trains(item) raise TypeError("Expected data[source], data[source, key] or data[train_selection]") def __contains__(self, item): if ( isinstance(item, tuple) and len(item) == 2 and all(isinstance(e, str) for e in item) ): return item[0] in self.all_sources and \ item[1] in self._get_source_data(item[0]) elif isinstance(item, str): return item in self.all_sources return False __iter__ = None # Disable iteration def _ipython_key_completions_(self): return list(self.all_sources) def get_entry_shape(self, source, key): """Get the shape of a single data entry for the given source & key""" return self._get_key_data(source, key).entry_shape def get_dtype(self, source, key): """Get the numpy data type for the given source & key""" return self._get_key_data(source, key).dtype def _check_data_missing(self, tid) -> bool: """Return True if a train does not have data for all sources""" for source in self.control_sources: file, _ = self._find_data(source, tid) if file is None: return True # No need to evaluate this for legacy sources as well. for source in self.instrument_sources - self.legacy_sources.keys(): file, pos = self._find_data(source, tid) if file is None: return True groups = {k.partition('.')[0] for k in self.keys_for_source(source)} for group in groups: _, counts = file.get_index(source, group) if counts[pos] == 0: return True return False def trains(self, devices=None, train_range=None, *, require_all=False, flat_keys=False, keep_dims=False): """Iterate over all trains in the data and gather all sources. :: run = Run('/path/to/my/run/r0123') for train_id, data in run.select("*/DET/*", "image.data").trains(): mod0 = data["FXE_DET_LPD1M-1/DET/0CH0:xtdf"]["image.data"] Parameters ---------- devices: dict or list, optional Filter data by sources and keys. Refer to :meth:`select` for how to use this. train_range: by_id or by_index object, optional Iterate over only selected trains, by train ID or by index. Refer to :meth:`select_trains` for how to use this. require_all: bool False (default) returns any data available for the requested trains. True skips trains which don't have all the selected data; this only makes sense if you make a selection with *devices* or :meth:`select`. flat_keys: bool False (default) returns nested dictionaries in each iteration indexed by source and then key. True returns a flat dictionary indexed by (source, key) tuples. keep_dims: bool False (default) drops the first dimension when there is a single entry. True preserves this dimension. Yields ------ tid : int The train ID of the returned train data : dict The data for this train, keyed by device name """ dc = self if devices is not None: dc = dc.select(devices) if train_range is not None: dc = dc.select_trains(train_range) return iter(TrainIterator(dc, require_all=require_all, flat_keys=flat_keys, keep_dims=keep_dims)) def train_from_id( self, train_id, devices=None, *, flat_keys=False, keep_dims=False): """Get train data for specified train ID. Parameters ---------- train_id: int The train ID devices: dict or list, optional Filter data by sources and keys. Refer to :meth:`select` for how to use this. flat_keys: bool False (default) returns a nested dict indexed by source and then key. True returns a flat dictionary indexed by (source, key) tuples. keep_dims: bool False (default) drops the first dimension when there is a single entry. True preserves this dimension. Returns ------- tid : int The train ID of the returned train data : dict The data for this train, keyed by device name Raises ------ KeyError if `train_id` is not found in the run. """ if train_id not in self.train_ids: raise TrainIDError(train_id) if devices is not None: return self.select(devices).train_from_id(train_id) res = {} for source in self.control_sources: source_data = res[source] = { 'metadata': {'source': source, 'timestamp.tid': train_id} } file, pos = self._find_data(source, train_id) if file is None: continue firsts, counts = file.get_index(source, '') first, count = firsts[pos], counts[pos] if not count: continue for key in self.keys_for_source(source): path = '/CONTROL/{}/{}'.format(source, key.replace('.', '/')) source_data[key] = file.file[path][first] for source in self.instrument_sources: source_data = res[source] = { 'metadata': {'source': source, 'timestamp.tid': train_id} } file, pos = self._find_data(source, train_id) if file is None: continue for key in self.keys_for_source(source): group = key.partition('.')[0] firsts, counts = file.get_index(source, group) first, count = firsts[pos], counts[pos] if not count: continue path = '/INSTRUMENT/{}/{}'.format(source, key.replace('.', '/')) if count == 1 and not keep_dims: source_data[key] = file.file[path][first] else: source_data[key] = file.file[path][first : first + count] if flat_keys: # {src: {key: data}} -> {(src, key): data} res = {(src, key): v for src, source_data in res.items() for (key, v) in source_data.items()} return train_id, res def train_from_index( self, train_index, devices=None, *, flat_keys=False, keep_dims=False): """Get train data of the nth train in this data. Parameters ---------- train_index: int Index of the train in the file. devices: dict or list, optional Filter data by sources and keys. Refer to :meth:`select` for how to use this. flat_keys: bool False (default) returns a nested dict indexed by source and then key. True returns a flat dictionary indexed by (source, key) tuples. keep_dims: bool False (default) drops the first dimension when there is a single entry. True preserves this dimension. Returns ------- tid : int The train ID of the returned train data : dict The data for this train, keyed by device name """ train_id = self.train_ids[train_index] return self.train_from_id( int(train_id), devices=devices, flat_keys=flat_keys, keep_dims=keep_dims) def get_data_counts(self, source, key): """Get a count of data points in each train for the given data field. Returns a pandas series with an index of train IDs. Parameters ---------- source: str Source name, e.g. "SPB_DET_AGIPD1M-1/DET/7CH0:xtdf" key: str Key of parameter within that device, e.g. "image.data". """ return self._get_key_data(source, key).data_counts() def get_series(self, source, key): """Return a pandas Series for a 1D data field defined by source & key. See :meth:`.KeyData.series` for details. """ return self._get_key_data(source, key).series() def get_dataframe(self, fields=None, *, timestamps=False): """Return a pandas dataframe for given data fields. :: df = run.get_dataframe(fields=[ ("*_XGM/*", "*.i[xy]Pos"), ("*_XGM/*", "*.photonFlux") ]) This links together multiple 1-dimensional datasets as columns in a table. Parameters ---------- fields : dict or list, optional Select data sources and keys to include in the dataframe. Selections are defined by lists or dicts as in :meth:`select`. timestamps : bool If false (the default), exclude the timestamps associated with each control data field. """ import pandas as pd if fields is not None: return self.select(fields).get_dataframe(timestamps=timestamps) series = [] for source in self.all_sources: for key in self.keys_for_source(source): if (not timestamps) and key.endswith('.timestamp'): continue series.append(self.get_series(source, key)) return pd.concat(series, axis=1) def get_array(self, source, key, extra_dims=None, roi=(), name=None): """Return a labelled array for a data field defined by source and key. see :meth:`.KeyData.xarray` for details. """ if isinstance(roi, by_index): roi = roi.value return self._get_key_data(source, key).xarray( extra_dims=extra_dims, roi=roi, name=name) def get_dask_array(self, source, key, labelled=False): """Get a Dask array for a data field defined by source and key. see :meth:`.KeyData.dask_array` for details. """ return self._get_key_data(source, key).dask_array(labelled=labelled) def get_run_value(self, source, key): """Get a single value from the RUN section of data files. RUN records each property of control devices as a snapshot at the beginning of the run. This includes properties which are not saved continuously in CONTROL data. This method is intended for use with data from a single run. If you combine data from multiple runs, it will raise MultiRunError. Parameters ---------- source: str Control device name, e.g. "HED_OPT_PAM/CAM/SAMPLE_CAM_4". key: str Key of parameter within that device, e.g. "triggerMode". """ return self._get_source_data(source).run_value(key) def get_run_values(self, source) -> dict: """Get a dict of all RUN values for the given source This includes keys which are also in CONTROL. Parameters ---------- source: str Control device name, e.g. "HED_OPT_PAM/CAM/SAMPLE_CAM_4". """ return self._get_source_data(source).run_values() def _merge_aliases(self, alias_dicts): """Merge multiple alias dictionaries and check for conflicts.""" new_aliases = {} for aliases in alias_dicts: for alias, literal in aliases.items(): alias = alias.lower().replace('_', '-') if new_aliases.setdefault(alias, literal) != literal: raise ValueError(f'conflicting alias definition ' f'for {alias} (or {alias.upper()}, ' f'{alias.replace("-", "_")}, etc.)') return new_aliases def union(self, *others): """Join the data in this collection with one or more others. This can be used to join multiple sources for the same trains, or to extend the same sources with data for further trains. The order of the datasets doesn't matter. Any aliases defined on the collections are combined as well unless their values conflict. Note that the trains for each source are unioned as well, such that ``run.train_ids == run[src].train_ids``. Returns a new :class:`DataCollection` object. """ sources_data_multi = defaultdict(list) for dc in (self,) + others: for source, srcdata in dc._sources_data.items(): sources_data_multi[source].append(srcdata) sources_data = {src: src_datas[0].union(*src_datas[1:]) for src, src_datas in sources_data_multi.items()} aliases = self._merge_aliases( [self._aliases] + [dc._aliases for dc in others]) train_ids = sorted(set().union(*[sd.train_ids for sd in sources_data.values()])) # Update the internal list of train IDs for the sources for sd in sources_data.values(): sd.train_ids = train_ids files = set().union(*[sd.files for sd in sources_data.values()]) return DataCollection( files, sources_data=sources_data, train_ids=train_ids, aliases=aliases, inc_suspect_trains=self.inc_suspect_trains, is_single_run=same_run(self, *others), ) def __or__(self, other): return self.union(other) def __ior__(self, other): return self.union(other) def _parse_aliases(self, alias_defs): """Parse alias definitions into alias dictionaries.""" alias_dicts = [] def is_valid_alias(k, v): return (isinstance(k, str) and ( isinstance(v, str) or (isinstance(v, tuple) and len(v) == 2) )) for alias_def in alias_defs: if isinstance(alias_def, Mapping): if any([not is_valid_alias(k, v) for k, v in alias_def.items()]): raise ValueError('alias definition by dict must be all ' 'str keys to str values for sources or ' '2-len tuples for sourcekeys') alias_dicts.append(alias_def) elif isinstance(alias_def, (str, os.PathLike)): # From a file. alias_dicts.append( self._load_aliases_from_file(Path(alias_def))) return alias_dicts def _load_aliases_from_file(self, aliases_path): """Load alias definitions from file.""" if aliases_path.suffix == '.json': import json with open(aliases_path, 'r') as f: data = json.load(f) elif aliases_path.suffix in ['.yaml', '.yml']: import yaml with open(aliases_path, 'r') as f: data = yaml.safe_load(f) elif aliases_path.suffix == '.toml': try: from tomli import load as load_toml except ImportError: # Try the built-in tomllib for 3.11+. from tomllib import load as load_toml with open(aliases_path, 'rb') as f: data = load_toml(f) aliases = {} def walk_dict_value(source, key_aliases): for alias, key in key_aliases.items(): aliases[alias] = (source, key) for key, value in data.items(): if isinstance(value, str): # Source alias. aliases[key] = value elif isinstance(value, list) and len(value) == 2: # Sourcekey alias by explicit list. aliases[key] = tuple((str(x) for x in value)) elif isinstance(value, dict): # Sourcekey alias by nested mapping. walk_dict_value(key, value) else: raise ValueError(f"unsupported literal type for alias '{key}'") return aliases def with_aliases(self, *alias_defs): """Apply aliases for convenient source and key access. Allows to define aliases for sources or source-key combinations that may be used instead of their literal names to retrieve :class:`SourceData` and :class:`KeyData` objects via :attr:`.DataCollection.alias`. Multiple alias definitions may be passed as positional arguments in different formats: 1. Passing a dictionary mapping aliases to sources (passed as strings) or source-key pairs (passed as a 2-len tuple of strings). 2. Passing a string or PathLike pointing to a JSON, YAML (requires pyYAML installed) or TOML (requires Python 3.11 or with tomli installed) file containing the aliases. For unsupported formats, an :class:`ImportError` is raised. The file should contain mappings from alias to sources as strings or source-key pairs as lists. In addition, source-key aliases may be defined by nested key-value pairs according to the respective format, shown here in YAML: .. code-block:: yaml # Source alias. sa1-xgm: SA1_XTD2_XGM/XGM/DOOCS # Direct source key alias. sa1-intensity: [SA1_XTD2_XGM/XGM/DOOCS:output, data.intensityTD] # Nested source key alias, useful if you want aliases for multiple # keys of the same source. SA3_XTD10_MONO/MDL/PHOTON_ENERGY: mono-central-energy: actualEnergy Returns a new :class:`DataCollection` object with the aliases for sources and keys. """ # Check for conflicts within these definitions new_aliases = self._merge_aliases( [self._aliases] + self._parse_aliases(alias_defs)) return DataCollection( self.files, sources_data=self._sources_data, train_ids=self.train_ids, aliases=new_aliases, inc_suspect_trains=self.inc_suspect_trains, is_single_run=self.is_single_run ) def only_aliases(self, *alias_defs, strict=False, require_all=False): """Apply aliases and select only the aliased sources and keys. A convenient function around :meth:`DataCollection.with_aliases` and :meth:`DataCollection.select` applying both the aliases passed as ``alias_defs`` to the former and then selecting down the :class:`DataCollection` to any sources and/or their keys for which aliases exist. By default and unlike :meth:`DataCollection.select`, any sources or keys present in the alias definitions but not the data itself are ignored. This can be changed via the optional argument ``strict``. The optional ``require_all`` argument restricts the trains to those for which all selected sources and keys have at least one data entry. By default, all trains remain selected. Returns a new :class:`DataCollection` object with only the aliased sources and keys. """ # Create new aliases. aliases = self._merge_aliases( [self._aliases] + self._parse_aliases(alias_defs)) # Set of sources aliased. aliased_sources = {literal for literal in aliases.values() if isinstance(literal, str)} # In the current implementation of DataCollection.select(), any # occurence of a wildcard glob will include all keys for a given # source, even if specific keys are listed as well. To be safe, # the source aliases are picked first and no specific sourcekey # aliases for the same source are included in the selection. # Entire source selections. selection = [(source, '*') for source in aliased_sources] # Specific key selections. selection += [ literal for literal in aliases.values() if isinstance(literal, tuple) \ and literal[0] not in aliased_sources ] if not strict: # If strict mode is disabled, any non-existing sources or # keys are stripped out. existing_sel_idx = [] for sel_idx, (source, key) in enumerate(selection): try: sd = self[source] except SourceNameError: # Source not present. continue else: if key != '*' and key not in sd: # Source present, but not key. continue existing_sel_idx.append(sel_idx) selection = [selection[sel_idx] for sel_idx in existing_sel_idx] # Create a new DataCollection from selecting and add the aliases. new_data = self.select(selection, require_all=require_all) new_data._aliases = aliases return new_data def drop_aliases(self): """Return a new DataCollection without any aliases.""" return DataCollection( self.files, sources_data=self._sources_data, train_ids=self.train_ids, aliases={}, inc_suspect_trains=self.inc_suspect_trains, is_single_run=self.is_single_run ) @property def alias(self): """Enables item access via source and key aliases.""" return AliasIndexer(self) def _expand_selection(self, selection): if isinstance(selection, dict): # {source: {key1, key2}} # {source: set()} or {source: None} -> all keys for this source res = {} for source, in_keys in selection.items(): if source not in self.all_sources: raise SourceNameError(source) # Empty dict was accidentally allowed and tested; keep it # working just in case. if in_keys == {}: in_keys = set() if in_keys is not None and not isinstance(in_keys, set): raise TypeError( f"keys in selection dict should be a set or None (got " f"{in_keys!r})" ) res[source] = self._sources_data[source].select_keys(in_keys) return res elif isinstance(selection, Iterable): # selection = [('src_glob', 'key_glob'), ...] # OR ['src_glob', 'src_glob', ...] sources_data_multi = defaultdict(list) for globs in selection: if isinstance(globs, str): src_glob = globs key_glob = '*' else: src_glob, key_glob = globs for source, keys in self._select_glob(src_glob, key_glob).items(): sources_data_multi[source].append( self._sources_data[source].select_keys(keys) ) return {src: src_datas[0].union(*src_datas[1:]) for src, src_datas in sources_data_multi.items()} elif isinstance(selection, DataCollection): return self._expand_selection(selection.selection) elif isinstance(selection, SourceData): return {selection.source: selection} elif isinstance(selection, KeyData): src = selection.source return {src: self._sources_data[src].select_keys({selection.key})} else: raise TypeError("Unknown selection type: {}".format(type(selection))) def _select_glob(self, source_glob, key_glob): source_re = re.compile(fnmatch.translate(source_glob)) key_re = re.compile(fnmatch.translate(key_glob)) if key_glob.endswith(('.value', '*')): ctrl_key_glob = key_glob ctrl_key_re = key_re else: # Add .value suffix for keys of CONTROL sources ctrl_key_glob = key_glob + '.value' ctrl_key_re = re.compile(fnmatch.translate(ctrl_key_glob)) matched = {} for source in self.all_sources: if not source_re.match(source): continue if key_glob == '*': # When the selection refers to all keys, make sure this # is restricted to the current selection of keys for # this source. if self.selection[source] is None: matched[source] = None else: matched[source] = self.selection[source] elif glob_wildcards_re.search(key_glob) is None: # Selecting a single key (no wildcards in pattern) # This check should be faster than scanning all keys: k = ctrl_key_glob if source in self.control_sources else key_glob if k in self._sources_data[source]: matched[source] = {k} else: r = ctrl_key_re if source in self.control_sources else key_re keys = set(filter(r.match, self.keys_for_source(source))) if keys: matched[source] = keys if not matched: raise ValueError("No matches for pattern {}" .format((source_glob, key_glob))) return matched def select(self, seln_or_source_glob, key_glob='*', require_all=False, require_any=False, *, warn_drop_trains_frac=1.): """Select a subset of sources and keys from this data. There are four possible ways to select data: 1. With two glob patterns (see below) for source and key names:: # Select data in the image group for any detector sources sel = run.select('*/DET/*', 'image.*') 2. With an iterable of source glob patterns, or (source, key) patterns:: # Select image.data and image.mask for any detector sources sel = run.select([('*/DET/*', 'image.data'), ('*/DET/*', 'image.mask')]) # Select & align undulator & XGM devices sel = run.select(['*XGM/*', 'MID_XTD1_UND/DOOCS/ENERGY'], require_all=True) Data is included if it matches any of the pattern pairs. 3. With a dict of source names mapped to sets of key names (or empty sets to get all keys):: # Select image.data from one detector source, and all data from one XGM sel = run.select({'SPB_DET_AGIPD1M-1/DET/0CH0:xtdf': {'image.data'}, 'SA1_XTD2_XGM/XGM/DOOCS': set()}) Unlike the others, this option *doesn't* allow glob patterns. It's a more precise but less convenient option for code that knows exactly what sources and keys it needs. 4. With an existing DataCollection, SourceData or KeyData object:: # Select the same data contained in another DataCollection prev_run.select(sel) The optional `require_all` and `require_any` arguments restrict the trains to those for which all or at least one selected sources and keys have at least one data entry. By default, all trains remain selected. With `require_all=True`, a warning will be shown if there are no trains with all the required data. Setting `warn_drop_trains_frac` can show the same warning if there are a few remaining trains. This is a number 0-1 representing the fraction of trains dropped for one source (default 1). Returns a new :class:`DataCollection` object for the selected data. .. note:: 'Glob' patterns may be familiar from selecting files in a Unix shell. ``*`` matches anything, so ``*/DET/*`` selects sources with "/DET/" anywhere in the name. There are several kinds of wildcard: - ``*``: anything - ``?``: any single character - ``[xyz]``: one character, "x", "y" or "z" - ``[0-9]``: one digit character - ``[!xyz]``: one character, *not* x, y or z Anything else in the pattern must match exactly. It's case-sensitive, so "x" does not match "X". """ if isinstance(seln_or_source_glob, str): seln_or_source_glob = [(seln_or_source_glob, key_glob)] sources_data = self._expand_selection(seln_or_source_glob) if require_all or require_any: # Select only those trains for which all (require_all) or at # least one (require_any) selected sources and keys have # data, i.e. have a count > 0 in their respective INDEX # section. if require_all: train_ids = self.train_ids else: # require_any # Empty list would be converted to np.float64 array. train_ids = np.empty(0, dtype=np.uint64) for source, srcdata in sources_data.items(): n_trains_prev = len(train_ids) for group in srcdata.index_groups: source_tids = np.empty(0, dtype=np.uint64) for f in self._sources_data[source].files: valid = True if self.inc_suspect_trains else f.validity_flag # Add the trains with data in each file. _, counts = f.get_index(source, group) source_tids = np.union1d( f.train_ids[valid & (counts > 0)], source_tids ) # Remove any trains previously selected, for which this # selected source and key group has no data. if require_all: train_ids = np.intersect1d(train_ids, source_tids) else: # require_any train_ids = np.union1d(train_ids, source_tids) n_drop = n_trains_prev - len(train_ids) if n_trains_prev and (n_drop / n_trains_prev) >= warn_drop_trains_frac: warn(f"{n_drop}/{n_trains_prev} ({n_drop / n_trains_prev :.0%})" f" trains dropped when filtering by {source}") train_ids = list(train_ids) # Convert back to a list. sources_data = { src: srcdata._only_tids(train_ids) for src, srcdata in sources_data.items() } else: train_ids = self.train_ids files = set().union(*[sd.files for sd in sources_data.values()]) return DataCollection( files, sources_data, train_ids=train_ids, aliases=self._aliases, inc_suspect_trains=self.inc_suspect_trains, is_single_run=self.is_single_run ) def deselect(self, seln_or_source_glob, key_glob='*'): """Select everything except the specified sources and keys. This takes the same arguments as :meth:`select`, but the sources and keys you specify are dropped from the selection. Returns a new :class:`DataCollection` object for the remaining data. """ if isinstance(seln_or_source_glob, str): seln_or_source_glob = [(seln_or_source_glob, key_glob)] deselection = self._expand_selection(seln_or_source_glob) # Subtract deselection from selection on self sources_data = {} for source, srcdata in self._sources_data.items(): if source not in deselection: sources_data[source] = srcdata continue desel_keys = deselection[source].sel_keys if desel_keys is None: continue # Drop the entire source remaining_keys = srcdata.keys() - desel_keys if remaining_keys: sources_data[source] = srcdata.select_keys(remaining_keys) files = set().union(*[sd.files for sd in sources_data.values()]) return DataCollection( files, sources_data=sources_data, train_ids=self.train_ids, aliases=self._aliases, inc_suspect_trains=self.inc_suspect_trains, is_single_run=self.is_single_run, ) def select_trains(self, train_range): """Select a subset of trains from this data. Slice trains by position within this data:: sel = run.select_trains(np.s_[:5]) Or select trains by train ID, with a slice or a list:: from extra_data import by_id sel1 = run.select_trains(by_id[142844490 : 142844495]) sel2 = run.select_trains(by_id[[142844490, 142844493, 142844494]]) Returns a new :class:`DataCollection` object for the selected trains. Raises ------ ValueError If given train IDs do not overlap with the trains in this data. """ new_train_ids = select_train_ids(self.train_ids, train_range) sources_data = { src: srcdata._only_tids(new_train_ids) for src, srcdata in self._sources_data.items() } files = set().union(*[sd.files for sd in sources_data.values()]) return DataCollection( files, sources_data=sources_data, train_ids=new_train_ids, aliases=self._aliases, inc_suspect_trains=self.inc_suspect_trains, is_single_run=self.is_single_run, ) def split_trains(self, parts=None, trains_per_part=None): """Split this data into chunks with a fraction of the trains each. Either *parts* or *trains_per_part* must be specified. This returns an iterator yielding new :class:`DataCollection` objects. The parts will have similar sizes, e.g. splitting 11 trains with ``trains_per_part=8`` will produce 5 & 6 trains, not 8 & 3. Parameters ---------- parts: int How many parts to split the data into. If trains_per_part is also specified, this is a minimum, and it may make more parts. It may also make fewer if there are fewer trains in the data. trains_per_part: int A maximum number of trains in each part. Parts will often have fewer trains than this. """ for source in self._sources_data.values(): assert source.train_ids == self.train_ids def dict_zip(iter_d): while True: try: yield {k: next(v) for (k, v) in iter_d.items()} except StopIteration: return for sources_data_part in dict_zip({ n: s.split_trains(parts=parts, trains_per_part=trains_per_part) for (n, s) in self._sources_data.items() }): files = set().union(*[sd.files for sd in sources_data_part.values()]) train_ids = list(sources_data_part.values())[0].train_ids yield DataCollection( files, sources_data=sources_data_part, train_ids=train_ids, aliases=self._aliases, inc_suspect_trains=self.inc_suspect_trains, is_single_run=self.is_single_run, ) def _check_source_conflicts(self): """Check for data with the same source and train ID in different files. """ sources_with_conflicts = set() files_conflict_cache = {} def files_have_conflict(files): fset = frozenset({f.filename for f in files}) if fset not in files_conflict_cache: if self.inc_suspect_trains: tids = np.concatenate([f.train_ids for f in files]) else: tids = np.concatenate([f.valid_train_ids for f in files]) files_conflict_cache[fset] = len(np.unique(tids)) != len(tids) return files_conflict_cache[fset] for source, srcdata in self._sources_data.items(): if files_have_conflict(srcdata.files): sources_with_conflicts.add(source) if sources_with_conflicts: raise ValueError("{} sources have conflicting data " "(same train ID in different files): {}".format( len(sources_with_conflicts), ", ".join(sources_with_conflicts) )) def _expand_trainids(self, counts, trainIds): n = min(len(counts), len(trainIds)) return np.repeat(trainIds[:n], counts.astype(np.intp)[:n]) def _find_data_chunks(self, source, key): """Find contiguous chunks of data for the given source & key Yields DataChunk objects. """ return self._get_key_data(source, key)._data_chunks def _find_data(self, source, train_id) -> Tuple[FileAccess, int]: for f in self._sources_data[source].files: ixs = (f.train_ids == train_id).nonzero()[0] if self.inc_suspect_trains and ixs.size > 0: return f, ixs[0] for ix in ixs: if f.validity_flag[ix]: return f, ix return None, None def __repr__(self): return f"" def info(self, details_for_sources=()): """Show information about the selected data. """ details_sources_re = [re.compile(fnmatch.translate(p)) for p in details_for_sources] # time info train_count = len(self.train_ids) if train_count == 0: first_train = last_train = '-' span_txt = '0.0' else: first_train = self.train_ids[0] last_train = self.train_ids[-1] seconds, deciseconds = divmod((last_train - first_train + 1), 10) try: td = datetime.timedelta(seconds=int(seconds)) except OverflowError: # Can occur if a train ID is corrupted span_txt = "OverflowError (one or more train IDs are probably wrong)" else: span_txt = f'{td}.{int(deciseconds)}' # disp print('# of trains: ', train_count) print('Duration: ', span_txt) print('First train ID:', first_train) print('Last train ID: ', last_train) print() if not details_for_sources: # Include summary section for multi-module detectors unless # source details are enabled. sources_by_detector = {} for source in self.detector_sources: name, modno = DETECTOR_SOURCE_RE.match(source).groups((1, 2)) sources_by_detector.setdefault(name, {})[modno] = source for detector_name in sorted(sources_by_detector.keys()): detector_modules = sources_by_detector[detector_name] print("{} XTDF detector modules of {}/*".format( len(detector_modules), detector_name )) if len(detector_modules) > 0: # Show detail on the first module (the others should be similar) mod_key = sorted(detector_modules)[0] mod_source = detector_modules[mod_key] dinfo = self.detector_info(mod_source) module = ' '.join(mod_key) dims = ' x '.join(str(d) for d in dinfo['dims']) print(" e.g. module {} : {} pixels".format(module, dims)) print(" {}".format(mod_source)) print(" {} frames per train, up to {} frames total".format( dinfo['frames_per_train'], dinfo['total_frames'] )) print() # Invert aliases for faster lookup. src_aliases = defaultdict(set) srckey_aliases = defaultdict(lambda: defaultdict(set)) for alias, literal in self._aliases.items(): if isinstance(literal, str): src_aliases[literal].add(alias) else: srckey_aliases[literal[0]][literal[1]].add(alias) def src_alias_list(s): if src_aliases[s]: alias_str = ', '.join(src_aliases[s]) return f'<{alias_str}>' return '' def src_data_detail(s, keys, prefix=''): """Detail for how much data is present for an instrument group""" if not keys: return counts = self.get_data_counts(s, list(keys)[0]) ntrains_data = (counts > 0).sum() print( f'{prefix}data for {ntrains_data} trains ' f'({ntrains_data / train_count:.2%}), ' f'up to {counts.max()} entries per train' ) def keys_detail(s, keys, prefix=''): """Detail for a group of keys""" for k in keys: entry_shape = self.get_entry_shape(s, k) if entry_shape: entry_info = f", entry shape {entry_shape}" else: entry_info = "" dt = self.get_dtype(s, k) if k in srckey_aliases[s]: alias_str = ' <' + ', '.join(srckey_aliases[s][k]) + '>' else: alias_str = '' print(f"{prefix}{k}{alias_str}\t[{dt}{entry_info}]") if details_for_sources: # All instrument sources with details enabled. displayed_inst_srcs = self.instrument_sources - self.legacy_sources.keys() print(len(displayed_inst_srcs), 'instrument sources:') else: # Only non-XTDF instrument sources without details enabled. displayed_inst_srcs = self.instrument_sources - self.detector_sources - self.legacy_sources.keys() print(len(displayed_inst_srcs), 'instrument sources (excluding XTDF detectors):') for s in sorted(displayed_inst_srcs): print(' -', s, src_alias_list(s)) if not any(p.match(s) for p in details_sources_re): continue # Detail for instrument sources: for group, keys in groupby(sorted(self.keys_for_source(s)), key=lambda k: k.split('.')[0]): print(f' - {group}:') keys = list(keys) src_data_detail(s, keys, prefix=' ') keys_detail(s, keys, prefix=' - ') print() print(len(self.control_sources), 'control sources:') for s in sorted(self.control_sources): print(' -', s, src_alias_list(s)) if any(p.match(s) for p in details_sources_re): # Detail for control sources: list keys ctrl_keys = self[s].keys(inc_timestamps=False) print(' - Control keys (1 entry per train):') keys_detail(s, sorted(ctrl_keys), prefix=' - ') run_keys = self._sources_data[s].files[0].get_run_keys(s) run_keys = {k[:-6] for k in run_keys if k.endswith('.value')} run_only_keys = run_keys - ctrl_keys if run_only_keys: print(' - Additional run keys (1 entry per run):') for k in sorted(run_only_keys): if k in srckey_aliases[s]: alias_str = ' <' + ', '.join(srckey_aliases[s][k]) + '>' else: alias_str = '' ds = self._sources_data[s].files[0].file[ f"/RUN/{s}/{k.replace('.', '/')}/value" ] entry_shape = ds.shape[1:] if entry_shape: entry_info = f", entry shape {entry_shape}" else: entry_info = "" dt = ds.dtype if h5py.check_string_dtype(dt): dt = 'string' print(f" - {k}{alias_str}\t[{dt}{entry_info}]") print() if self.legacy_sources: # Collect legacy souces matching DETECTOR_SOURCE_RE # separately for a condensed view. detector_legacy_sources = defaultdict(set) print(len(self.legacy_sources), 'legacy source names:') for s in sorted(self.legacy_sources.keys()): m = DETECTOR_SOURCE_RE.match(s) if m is not None: detector_legacy_sources[m[1]].add(s) else: # Only print non-XTDF legacy sources. print(' -', s, '->', self.legacy_sources[s]) for legacy_det, legacy_sources in detector_legacy_sources.items(): canonical_mod = self.legacy_sources[next(iter(legacy_sources))] canonical_det = DETECTOR_SOURCE_RE.match(canonical_mod)[1] print(' -', f'{legacy_det}/*', '->', f'{canonical_det}/*', f'({len(legacy_sources)})') print() def plot_missing_data(self, min_saved_pct=95, expand_instrument=False): """Plot sources that have missing data for some trains. Example output: .. image:: _static/plot_missing_data.png Parameters ---------- min_saved_pct: int or float, optional Only show sources with less than this percentage of trains saved. expand_instrument: bool, optional Show subsections within INSTRUMENT groups. These sections usually have the same data missing, but it's possible for them to differ. """ n_trains = len(self.train_ids) # Helper function that returns an alias for a source if one is # available, and the source name otherwise. def best_src_name(src): for alias, alias_ident in self._aliases.items(): if isinstance(alias_ident, str) and alias_ident == src: return alias return src # Check how much data is missing for each source run_tids = np.array(self.train_ids) start = time.time() counts = { } for src in self.all_sources: srcdata = self[src] if expand_instrument and srcdata.is_instrument: for group in srcdata.index_groups: counts[f"{best_src_name(src)} {group}.*"] = \ srcdata.data_counts(labelled=False, index_group=group) else: counts[best_src_name(src)] = srcdata.data_counts(labelled=False) # Warn the user if the function will take longer than a couple seconds if start is not None and (time.time() - start) > 2: print(f"Checking sources in {len(self.files)} files, this may take a minute...") # Set the start time to a dummy value so the message will # never be printed again. start = None # Identify the sources with less than min_saved_pct% of trains flaky_sources = {} save_pcts = {} for name, cnt in counts.items(): src_tids = run_tids[cnt > 0] save_pct = len(src_tids) / n_trains * 100 if save_pct <= min_saved_pct: flaky_sources[name] = src_tids save_pcts[name] = save_pct # Sort the flaky sources by decreasing order of how many trains they're missing flaky_sources = dict(sorted( flaky_sources.items(), key=lambda x: (len(x[1]), x[0]), reverse=True )) # Plot missing data import matplotlib.pyplot as plt from matplotlib.lines import Line2D fig, ax = plt.subplots(figsize=(9, max(3, len(flaky_sources) / 3.5))) bar_height = 0.5 for i, src in enumerate(flaky_sources): # First find all the trains that are missing save_line = np.zeros(n_trains).astype(bool) save_line[np.intersect1d(self.train_ids, flaky_sources[src], return_indices=True)[1]] = True # Loop over each train to find blocks of trains that are either # present or missing. bars = { } block_start = 0 for idx in range(n_trains): if save_line[idx] != save_line[block_start]: # If we find a train that doesn't match the save status of # the current block, create a new entry in `bars` to record # the start index, the block length, and the save status. bars[(block_start, idx - block_start)] = save_line[block_start] block_start = idx # Add the last block bars[(block_start, n_trains - block_start)] = save_line[block_start] # Plot all the blocks ax.broken_barh(bars.keys(), (i, bar_height), color=["deeppink" if x else "k" for x in bars.values()]) # Set labels and ticks tick_labels = [f"{src} ({save_pcts[src]:.2f}%)" for i, (src, tids) in enumerate(flaky_sources.items())] ax.set_yticks(np.arange(len(flaky_sources)) + bar_height / 2, labels=tick_labels, fontsize=8) ax.set_xlabel("Train ID index") # Set title title = f"Sources with less than {min_saved_pct}% of trains saved" run_meta = self.run_metadata() if "proposalNumber" in run_meta and "runNumber" in run_meta: title += f" in p{run_meta['proposalNumber']}, run {run_meta['runNumber']}" ax.set_title(title, pad=25 + len(flaky_sources) * 0.25) # Create legend legend_elements = [Line2D([0], [0], marker='o', color='w', label=label, markerfacecolor=c, markersize=6) for c, label in [("k", "Missing"), ("deeppink", "Present")]] # bbox_factor is a variable that tries to scale down the bounding box of # the legend as the height of the plot grows with more sources. It's # necessary because the bounding box coordinates are relative to the # plot size, so with a tall plot the figure/legend padding will be # massive. 7000 is a magic number that seems to give good results. bbox_factor = 1 - len(flaky_sources) / 7000 ax.legend(handles=legend_elements, bbox_to_anchor=(0, 1.02 * bbox_factor, 1, 0.1 * bbox_factor), loc='lower center', ncol=2, borderaxespad=0) fig.tight_layout() return ax def detector_info(self, source): """Get statistics about the detector data. Returns a dictionary with keys: - 'dims' (pixel dimensions) - 'frames_per_train' (estimated from one file) - 'total_frames' (estimated assuming all trains have data) """ source_files = self._sources_data[source].files file0 = sorted(source_files, key=lambda fa: fa.filename)[0] _, counts = file0.get_index(source, 'image') counts = set(np.unique(counts)) counts.discard(0) if len(counts) > 1: warn("Varying number of frames per train: %s" % counts) if counts: fpt = int(counts.pop()) else: fpt = 0 dims = file0.file['/INSTRUMENT/{}/image/data'.format(source)].shape[-2:] return { 'dims': dims, # Some trains have 0 frames; max is the interesting value 'frames_per_train': fpt, 'total_frames': fpt * len(self.train_ids), } def train_info(self, train_id): """Show information about a specific train in the run. Parameters ---------- train_id: int The specific train ID you get details information. Raises ------ ValueError if `train_id` is not found in the run. """ if train_id not in self.train_ids: raise ValueError("train {} not found in run.".format(train_id)) files = [f for f in self.files if f.has_train_ids([train_id], self.inc_suspect_trains)] ctrl = set().union(*[f.control_sources for f in files]) inst = set().union(*[f.instrument_sources for f in files]) # disp print('Train [{}] information'.format(train_id)) print('Devices') print('\tInstruments') [print('\t-', d) for d in sorted(inst)] or print('\t-') print('\tControls') [print('\t-', d) for d in sorted(ctrl)] or print('\t-') def train_timestamps(self, labelled=False, *, pydatetime=False, euxfel_local_time=False): """Get approximate timestamps for each train Timestamps are stored and returned in UTC by default. Older files (before format version 1.0) do not have timestamp data, and the returned data in those cases will have the special value NaT (Not a Time). If *labelled* is True, they are returned in a pandas series, labelled with train IDs. If *pydatetime* is True, a list of Python datetime objects (truncated to microseconds) is returned, the same length as data.train_ids. Otherwise (by default), timestamps are returned as a NumPy array with datetime64 dtype. *euxfel_local_time* can be True when either *labelled* or *pydatetime* is True. In this case, timestamps are converted to the `Europe/Berlin` timezone. """ arr = np.zeros(len(self.train_ids), dtype=np.uint64) id_to_ix = {tid: i for (i, tid) in enumerate(self.train_ids)} missing_tids = np.array(self.train_ids) for fa in self.files: tids, file_ixs, _ = np.intersect1d( fa.train_ids, missing_tids, return_indices=True ) if not self.inc_suspect_trains: valid = fa.validity_flag[file_ixs] tids, file_ixs = tids[valid], file_ixs[valid] if tids.size == 0 or 'INDEX/timestamp' not in fa.file: continue file_tss = fa.file['INDEX/timestamp'][:] for tid, ts in zip(tids, file_tss[file_ixs]): arr[id_to_ix[tid]] = ts missing_tids = np.setdiff1d(missing_tids, tids) if missing_tids.size == 0: # We've got a timestamp for every train break arr = arr.astype('datetime64[ns]') epoch = np.uint64(0).astype('datetime64[ns]') arr[arr == epoch] = 'NaT' # Not a Time if labelled: import pandas as pd series = pd.Series(arr, index=self.train_ids).dt.tz_localize('UTC') return series.dt.tz_convert('Europe/Berlin') if euxfel_local_time else series elif pydatetime: from datetime import datetime, timezone res = [] for npdt in arr: pydt = npdt.astype('datetime64[ms]').item() if pydt is not None: # Numpy NaT becomes None pydt = pydt.replace(tzinfo=timezone.utc) if euxfel_local_time: from zoneinfo import ZoneInfo pydt = pydt.astimezone(ZoneInfo('Europe/Berlin')) res.append(pydt) return res elif euxfel_local_time: raise ValueError( 'The euxfel_local_time option ' + 'can only be used if either labelled or pydatetime ' + 'are set to True' ) return arr def run_metadata(self) -> dict: """Get a dictionary of metadata about the run From file format version 1.0, the files capture: creationDate, daqLibrary, dataFormatVersion, karaboFramework, proposalNumber, runNumber, sequenceNumber, updateDate. """ if not self.is_single_run: raise MultiRunError() return self.files[0].metadata() def write(self, filename): """Write the selected data to a new HDF5 file You can choose a subset of the data using methods like :meth:`select` and :meth:`select_trains`, then use this write it to a new, smaller file. The target filename will be overwritten if it already exists. """ from .writer import FileWriter FileWriter(filename, self).write() def write_virtual(self, filename): """Write an HDF5 file with virtual datasets for the selected data. This doesn't copy the data, but each virtual dataset provides a view of data spanning multiple sequence files, which can be accessed as if it had been copied into one big file. This is *not* the same as `building virtual datasets to combine multi-module detector data `__. See :doc:`agipd_lpd_data` for that. Creating and reading virtual datasets requires HDF5 version 1.10. The target filename will be overwritten if it already exists. """ from .writer import VirtualFileWriter VirtualFileWriter(filename, self).write() def get_virtual_dataset(self, source, key, filename=None): """Create an HDF5 virtual dataset for a given source & key A dataset looks like a multidimensional array, but the data is loaded on-demand when you access it. So it's suitable as an interface to data which is too big to load entirely into memory. This returns an h5py.Dataset object. This exists in a real file as a 'virtual dataset', a collection of links pointing to the data in real datasets. If *filename* is passed, the file is written at that path, overwriting if it already exists. Otherwise, it uses a new temp file. To access the dataset from other worker processes, give them the name of the created file along with the path to the dataset inside it (accessible as ``ds.name``). They will need at least HDF5 1.10 to access the virtual dataset, and they must be on a system with access to the original data files, as the virtual dataset points to those. """ self._check_field(source, key) from .writer import VirtualFileWriter if filename is None: # Make a temp file to hold the virtual dataset. fd, filename = tempfile.mkstemp(suffix='-karabo-data-vds.h5') os.close(fd) vfw = VirtualFileWriter(filename, self) vfw.write_train_ids() ds_path = vfw.add_dataset(source, key) vfw.write_indexes() vfw.write_metadata() vfw.set_writer() vfw.file.close() # Close the file for writing and reopen read-only f = h5py.File(filename, 'r') return f[ds_path] class TrainIterator: """Iterate over trains in a collection of data Created by :meth:`DataCollection.trains`. """ def __init__( self, data, require_all=True, flat_keys=False, keep_dims=False): self.data = data self.require_all = require_all self.keep_dims = keep_dims # {(source, key): (f, dataset)} self._datasets_cache = {} self._set_result = self._set_result_flat if flat_keys \ else self._set_result_nested @staticmethod def _set_result_nested(res, source, key, value): try: res[source][key] = value except KeyError: res[source] = {key: value} @staticmethod def _set_result_flat(res, source, key, value): res[(source, key)] = value def _find_data(self, source, key, tid): file, ds = self._datasets_cache.get((source, key), (None, None)) if ds: ixs = (file.train_ids == tid).nonzero()[0] if self.data.inc_suspect_trains and ixs.size > 0: return file, ixs[0], ds for ix in ixs: if file.validity_flag[ix]: return file, ix, ds data = self.data section = 'CONTROL' if source in data.control_sources else 'INSTRUMENT' path = '/{}/{}/{}'.format(section, source, key.replace('.', '/')) f, pos = data._find_data(source, tid) if f is not None: ds = f.file[path] self._datasets_cache[(source, key)] = (f, ds) return f, pos, ds return None, None, None def _assemble_data(self, tid): res = {} for source in self.data.control_sources: self._set_result(res, source, 'metadata', {'source': source, 'timestamp.tid': tid}) for key in self.data.keys_for_source(source): file, pos, ds = self._find_data(source, key, tid) if ds is None: continue firsts, counts = file.get_index(source, '') first, count = firsts[pos], counts[pos] if not count: continue self._set_result(res, source, key, ds[first]) for source in self.data.instrument_sources: self._set_result(res, source, 'metadata', {'source': source, 'timestamp.tid': tid}) for key in self.data.keys_for_source(source): file, pos, ds = self._find_data(source, key, tid) if ds is None: continue group = key.partition('.')[0] firsts, counts = file.get_index(source, group) first, count = firsts[pos], counts[pos] if count == 1 and not self.keep_dims: self._set_result(res, source, key, ds[first]) elif count > 0: self._set_result(res, source, key, ds[first : first + count]) return res def __iter__(self): for tid in self.data.train_ids: tid = int(tid) # Convert numpy int to regular Python int if self.require_all and self.data._check_data_missing(tid): continue yield tid, self._assemble_data(tid) def H5File(path, *, inc_suspect_trains=True): """Open a single HDF5 file generated at European XFEL. :: file = H5File("RAW-R0017-DA01-S00000.h5") Returns a :class:`DataCollection` object. Parameters ---------- path: str Path to the HDF5 file inc_suspect_trains: bool If False, suspect train IDs within a file are skipped. In newer files, trains where INDEX/flag are 0 are suspect. For older files which don't have this flag, out-of-sequence train IDs are suspect. If True (default), it tries to include these trains. """ return DataCollection.from_path(path, inc_suspect_trains=inc_suspect_trains) def RunDirectory( path, include='*', file_filter=locality.lc_any, *, inc_suspect_trains=True, parallelize=True, _use_voview=True, ): """Open a European XFEL run directory. :: run = RunDirectory("/gpfs/exfel/exp/XMPL/201750/p700000/raw/r0001") A run directory contains a number of HDF5 files with data from the same time period. Returns a :class:`DataCollection` object. Parameters ---------- path: str Path to the run directory containing HDF5 files. include: str Wildcard string to filter data files. file_filter: callable Function to subset the list of filenames to open. Meant to be used with functions in the extra_data.locality module. inc_suspect_trains: bool If False, suspect train IDs within a file are skipped. In newer files, trains where INDEX/flag are 0 are suspect. For older files which don't have this flag, out-of-sequence train IDs are suspect. If True (default), it tries to include these trains. parallelize: bool Enable or disable opening files in parallel. Particularly useful if creating child processes is not allowed (e.g. in a daemonized :class:`multiprocessing.Process`). """ files = [f for f in os.listdir(path) if f.endswith('.h5') and (f.lower() != 'overview.h5')] files = [osp.join(path, f) for f in fnmatch.filter(files, include)] sel_files = file_filter(files) if not sel_files: raise FileNotFoundError( f"No HDF5 files found in {path} with glob pattern {include}") if _use_voview and (sel_files == files): voview_file_acc = voview.find_file_valid(path) if voview_file_acc is not None: return DataCollection([voview_file_acc], is_single_run=True, ctx_closes=True) files_map = RunFilesMap(path) t0 = time.monotonic() d = DataCollection.from_paths( sel_files, files_map, inc_suspect_trains=inc_suspect_trains, is_single_run=True, parallelize=parallelize ) log.debug("Opened run with %d files in %.2g s", len(d.files), time.monotonic() - t0) files_map.save(d.files) return d # RunDirectory was previously RunHandler; we'll leave it accessible in case # any code was already using it. RunHandler = RunDirectory DEFAULT_ALIASES_FILE = "{}/usr/extra-data-aliases.yml" def open_run( proposal, run, data='default', include='*', file_filter=locality.lc_any, *, inc_suspect_trains=True, parallelize=True, aliases=DEFAULT_ALIASES_FILE, _use_voview=True, ): """Access European XFEL data by proposal and run number. :: run = open_run(proposal=700000, run=1) Returns a :class:`DataCollection` object. This finds the run directory in standard paths on EuXFEL infrastructure. Parameters ---------- proposal: str, int A proposal number, such as 2012, '2012', 'p002012', or a path such as '/gpfs/exfel/exp/SPB/201701/p002012'. run: str, int A run number such as 243, '243' or 'r0243'. data: str or Sequence of str 'raw', 'proc' (processed), or any other location relative to the proposal path with data per run to access. May also be 'default' (combining raw & proc), 'all' (combined but preferring proc where source names match) or a sequence of strings to load data from several locations, with later locations overwriting sources present in earlier ones. include: str Wildcard string to filter data files. file_filter: callable Function to subset the list of filenames to open. Meant to be used with functions in the extra_data.locality module. inc_suspect_trains: bool If False, suspect train IDs within a file are skipped. In newer files, trains where INDEX/flag are 0 are suspect. For older files which don't have this flag, out-of-sequence train IDs are suspect. If True (default), it tries to include these trains. parallelize: bool Enable or disable opening files in parallel. Particularly useful if creating child processes is not allowed (e.g. in a daemonized :class:`multiprocessing.Process`). aliases: str, Path Path to an alias file for the run, see the documentation for :meth:`DataCollection.with_aliases` for details. If the argument is a string with a format argument like ``{}/path/to/aliases.yml``, then the format argument will be replaced with the proposal directory path. By default it looks for a file named ``{}/usr/extra-data-aliases.yml``. """ absence_ok = set() if data == 'default': data = ['proc', 'raw'] absence_ok = {'proc'} elif data == 'all': data = ['raw', 'proc'] if isinstance(data, Sequence) and not isinstance(data, str): base_dc = None for origin in data: try: # Attempt to open data at this origin, but this may not # exist. origin_dc = open_run( proposal, run, data=origin, include=include, file_filter=file_filter, inc_suspect_trains=inc_suspect_trains, parallelize=parallelize, aliases=aliases, _use_voview=_use_voview, ) except FileNotFoundError: if origin not in absence_ok: if base_dc is None: raise warn(f'No data available for this run at origin {origin}') continue if base_dc is None: # First origin found base_dc = origin_dc continue # Deselect to those sources in the base not present in # this origin. base_extra = base_dc.deselect( [(src, '*') for src in base_dc.all_sources & origin_dc.all_sources]) if base_extra.files: # If base is not a subset of this origin, merge the # "extra" base sources into the origin sources and # re-enable is_single_run flag. base_dc = origin_dc.union(base_extra) base_dc.is_single_run = True else: # If the sources we previously found are a subset of those # in the latest origin, discard the previous data. base_dc = origin_dc return base_dc if isinstance(proposal, str): if ('/' not in proposal) and not proposal.startswith('p'): proposal = 'p' + proposal.rjust(6, '0') else: # Allow integers, including numpy integers proposal = 'p{:06d}'.format(index(proposal)) prop_dir = find_proposal(proposal) if isinstance(run, str): if run.startswith('r'): run = run[1:] else: run = index(run) # Allow integers, including numpy integers run = 'r' + str(run).zfill(4) dc = RunDirectory( osp.join(prop_dir, data, run), include=include, file_filter=file_filter, inc_suspect_trains=inc_suspect_trains, parallelize=parallelize, _use_voview=_use_voview, ) # Normalize string arguments to be an absolute Path if isinstance(aliases, str): aliases = Path(aliases.format(prop_dir)) # If we're using the default aliases file and it doesn't exist, ignore it # without throwing any errors. default_aliases = Path(DEFAULT_ALIASES_FILE.format(prop_dir)) if aliases == default_aliases and not default_aliases.is_file(): aliases = None if aliases is not None: dc = dc.with_aliases(aliases) log.info("Loading %d aliases from: %s", len(dc._aliases), aliases) return dc ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/run_files_map.py0000644000175100001660000002000414757376472020644 0ustar00runnerdockerimport json import logging import os import os.path as osp import numpy as np import re from tempfile import mkstemp import time SCRATCH_ROOT_DIR = "/gpfs/exfel/exp/" log = logging.getLogger(__name__) def atomic_dump(obj, path, **kwargs): """Write JSON to a file atomically This aims to avoid garbled files from multiple processes writing the same cache. It doesn't try to protect against e.g. sudden power failures, as forcing the OS to flush changes to disk may hurt performance. """ dirname, basename = osp.split(path) fd, tmp_filename = mkstemp(dir=dirname, prefix=basename) try: with open(fd, 'w') as f: json.dump(obj, f, **kwargs) except: os.unlink(tmp_filename) raise os.replace(tmp_filename, path) class RunFilesMap: """Cached data about HDF5 files in a run directory Stores the train IDs and source names in each file, along with some metadata to check that the cache is still valid. The cached information can be stored in: - (run dir)/karabo_data_map.json - (proposal dir)/scratch/.karabo_data_maps/raw_r0032.json """ cache_file = None expected_cache_keys = frozenset({ 'train_ids', 'control_sources', 'instrument_sources', 'suspect_train_indices', 'legacy_sources', }) def __init__(self, directory): self.directory = osp.abspath(directory) self.dir_stat = os.stat(self.directory) self.files_data = {} self.candidate_paths = self.map_paths_for_run(directory) self.load() def map_paths_for_run(self, directory): paths = [osp.join(directory, 'karabo_data_map.json')] # After resolving symlinks, data on Maxwell is stored in either # GPFS, e.g. /gpfs/exfel/d/proc/SCS/201901/p002212 or # dCache, e.g. /pnfs/xfel.eu/exfel/archive/XFEL/raw/SCS/201901/p002212 # On the online cluster the resolved path stay: # /gpfs/exfel/exp/inst/cycle/prop/(raw|proc)/run maxwell_match = re.match( # raw/proc instr cycle prop run r'.+/(raw|proc|red|open)/(\w+)/(\w+)/(p\d+)/(r\d+)/?$', os.path.realpath(directory) ) online_match = re.match( # instr cycle prop raw/proc run r'^.+/(\w+)/(\w+)/(p\d+)/(raw|proc)/(r\d+)/?$', os.path.realpath(directory) ) if maxwell_match or online_match: if maxwell_match: raw_proc, instr, cycle, prop, run_nr = maxwell_match.groups() else: instr, cycle, prop, raw_proc, run_nr = online_match.groups() fname = '%s_%s.json' % (raw_proc, run_nr) prop_scratch = osp.join( SCRATCH_ROOT_DIR, instr, cycle, prop, 'scratch' ) if osp.isdir(prop_scratch): paths.append( osp.join(prop_scratch, '.karabo_data_maps', fname) ) return paths def load(self): """Load the cached data This drops invalid or incomplete cache entries. """ loaded_data = [] t0 = time.monotonic() paths_mtimes = [] for path in self.candidate_paths: try: st = os.stat(path) paths_mtimes.append((path, st.st_mtime)) except (FileNotFoundError, PermissionError): pass # Try the newest found file (greatest mtime) first for path, _ in sorted(paths_mtimes, key=lambda x: x[1], reverse=True): try: with open(path) as f: loaded_data = json.load(f) self.cache_file = path log.debug("Loaded cached files map from %s", path) break except (FileNotFoundError, PermissionError, json.JSONDecodeError,): pass for info in loaded_data: filename = info['filename'] try: st = os.stat(osp.join(self.directory, filename)) except OSError: continue if self._cache_info_valid(info, st): self.files_data[filename] = info if loaded_data: dt = time.monotonic() - t0 log.debug("Loaded cached files map in %.2g s", dt) @classmethod def _cache_info_valid(cls, info, file_stat: os.stat_result): # Ignore the cached info if the file size or mtime have changed, or # if it is missing expected keys (likely keys added more recently). return ((file_stat.st_mtime == info['mtime']) and (file_stat.st_size == info['size']) and cls.expected_cache_keys.issubset(info.keys())) def is_my_directory(self, dir_path): return osp.samestat(os.stat(dir_path), self.dir_stat) def get(self, path): """Get cache entry for a file path Returns a dict or None """ dirname, fname = osp.split(osp.abspath(path)) if self.is_my_directory(dirname) and (fname in self.files_data): d = self.files_data[fname] res = { 'train_ids': np.array(d['train_ids'], dtype=np.uint64), 'control_sources': frozenset(d['control_sources']), 'instrument_sources': frozenset(d['instrument_sources']), 'legacy_sources': dict(d['legacy_sources']), } res['flag'] = flag = np.ones_like(d['train_ids'], dtype=np.bool_) flag[d['suspect_train_indices']] = 0 return res return None def save(self, files): """Save the cache if needed This skips writing the cache out if all the data files already have valid cache entries. It also silences permission errors from writing the cache file. """ need_save = False for file_access in files: dirname, fname = osp.split(osp.abspath(file_access.filename)) if self.is_my_directory(dirname) and fname not in self.files_data: log.debug("Will save cached data for %s", fname) need_save = True # It's possible that the file we opened has been replaced by a # new one before this runs. If possible, use the stat FileAccess got # from the file descriptor, which will always be accurate. # Stat-ing the filename will almost always work as a fallback. if isinstance(file_access.metadata_fstat, os.stat_result): st = file_access.metadata_fstat else: log.warning("No fstat for %r, will stat name instead", fname) st = os.stat(file_access.filename) self.files_data[fname] = { 'filename': fname, 'mtime': st.st_mtime, 'size': st.st_size, 'train_ids': [int(t) for t in file_access.train_ids], 'control_sources': sorted(file_access.control_sources), 'instrument_sources': sorted(file_access.instrument_sources), 'legacy_sources': {k: file_access.legacy_sources[k] for k in sorted(file_access.legacy_sources)}, 'suspect_train_indices': [ int(i) for i in (~file_access.validity_flag).nonzero()[0] ], } if need_save: t0 = time.monotonic() save_data = [info for (_, info) in sorted(self.files_data.items())] for path in self.candidate_paths: try: os.makedirs(osp.dirname(path), exist_ok=True) atomic_dump(save_data, path, indent=2) except PermissionError: continue else: dt = time.monotonic() - t0 log.debug("Saved run files map to %s in %.2g s", path, dt) return log.debug("Unable to save run files map") ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/sourcedata.py0000644000175100001660000004535114757376472020167 0ustar00runnerdockerimport fnmatch import re from typing import Dict, List, Optional import h5py import numpy as np from .exceptions import MultiRunError, NoDataError, PropertyNameError from .file_access import FileAccess from .keydata import KeyData from .read_machinery import (by_id, by_index, glob_wildcards_re, is_int_like, same_run, select_train_ids, split_trains, trains_files_index) class SourceData: """Data for one source Don't create this directly; get it from ``run[source]``. """ _device_class = ... _first_source_file = ... def __init__( self, source, *, sel_keys, train_ids, files, section, canonical_name, is_single_run, inc_suspect_trains=True, ): self.source = source self.sel_keys = sel_keys self.train_ids = train_ids self.files: List[FileAccess] = files self.section = section self.canonical_name = canonical_name self.is_single_run = is_single_run self.inc_suspect_trains = inc_suspect_trains def __repr__(self): return f"" @property def is_control(self): """Whether this source is a control source.""" return self.section == 'CONTROL' @property def is_instrument(self): """Whether this source is an instrument source.""" return self.section == 'INSTRUMENT' @property def is_legacy(self): """Whether this source is a legacy name for another source.""" return self.canonical_name != self.source def _has_exact_key(self, key): if self.sel_keys is not None: return key in self.sel_keys for f in self.files: return f.has_source_key(self.source, key) def __contains__(self, key): res = self._has_exact_key(key) if (not res) and self.is_control: res = self._has_exact_key(key + '.value') return res __iter__ = None # Disable iteration def __getitem__(self, key): if ( isinstance(key, (by_id, by_index, list, np.ndarray, slice)) or is_int_like(key) ): return self.select_trains(key) if key not in self: raise PropertyNameError(key, self.source) ds0 = self.files[0].file[ f"{self.section}/{self.source}/{key.replace('.', '/')}" ] if isinstance(ds0, h5py.Group): # This can only occur with a CONTROL key missing its .value suffix ds0 = ds0['value'] key += '.value' return KeyData( self.source, key, train_ids=self.train_ids, files=self.files, section=self.section, dtype=ds0.dtype, eshape=ds0.shape[1:], inc_suspect_trains=self.inc_suspect_trains, ) def _ipython_key_completions_(self): return list(self.keys(inc_timestamps=False)) def _get_first_source_file(self): first_kd = self[self.one_key()] try: # This property is an empty list if no trains are selected. sample_path = first_kd.source_file_paths[0] except IndexError: sample_path = first_kd.files[0].filename return FileAccess(sample_path) @property def storage_class(self): if self._first_source_file is ...: self._first_source_file = self._get_first_source_file() return self._first_source_file.storage_class @property def data_category(self): if self._first_source_file is ...: self._first_source_file = self._get_first_source_file() return self._first_source_file.data_category @property def aggregator(self): if self._first_source_file is ...: self._first_source_file = self._get_first_source_file() return self._first_source_file.aggregator def keys(self, inc_timestamps=True): """Get a set of key names for this source If you have used :meth:`select` to filter keys, only selected keys are returned. For control sources, each Karabo property is stored in the file as two keys, with '.value' and '.timestamp' suffixes. By default, these are given separately. Pass ``inc_timestamps=False`` to ignore timestamps and drop the '.value' suffix, giving names as used in Karabo. Only one file is used to find the keys. Within a run, all files should have the same keys for a given source, but if you use :meth:`union` to combine two runs where the source was configured differently, the result can be unpredictable. """ if (not inc_timestamps) and self.is_control: return {k[:-6] for k in self.keys() if k.endswith('.value')} if self.sel_keys is not None: return self.sel_keys # The same source may be in multiple files, but this assumes it has # the same keys in all files that it appears in. for f in self.files: return f.get_keys(self.source) def one_key(self, index_group=None): """Get a single (random) key for this source If you only need a single key, this can be much faster than calling :meth:`keys`. If *index_group* is omitted, the key may be part of any index group. """ if self.sel_keys is not None: if index_group is None: return next(iter(self.sel_keys)) prefix = f'{index_group}.' for key in self.sel_keys: if key.startswith(prefix): return key raise ValueError(f'none of the selected keys is part of ' f'`{index_group}`') for f in self.files: return f.get_one_key(self.source, index_group) @property def index_groups(self) -> set: """The part of keys needed to look up index data.""" if self.is_instrument: # For INSTRUMENT sources, the INDEX is saved by # key group, which is the first hash component. In # many cases this is 'data', but not always. if self.sel_keys is None: # All keys are selected. return self.files[0].index_groups(self.source) else: return {key.partition('.')[0] for key in self.sel_keys} else: # CONTROL data has no key group. return {''} def _glob_keys(self, pattern: str) -> Optional[set]: if self.is_control and not pattern.endswith(('.value', '*')): pattern += '.value' if pattern == '*': # When the selection refers to all keys, make sure this # is restricted to the current selection of keys for # this source. matched = self.sel_keys elif glob_wildcards_re.search(pattern) is None: # Selecting a single key (no wildcards in pattern) # This check should be faster than scanning all keys: matched = {pattern} if pattern in self else set() else: key_re = re.compile(fnmatch.translate(pattern)) matched = set(filter(key_re.match, self.keys())) if matched == set(): raise PropertyNameError(pattern, self.source) return matched def select_keys(self, keys) -> 'SourceData': """Select a subset of the keys in this source *keys* is either a single key name, a set of names, or a glob pattern (e.g. ``beamPosition.*``) matching a subset of keys. PropertyNameError is matched if a specified key does not exist. Returns a new :class:`SourceData` object. """ if isinstance(keys, str): keys = self._glob_keys(keys) elif keys: # If a specific set of keys is selected, make sure # they are all valid, adding .value as needed for CONTROl keys. normed_keys = set() for key in keys: if self._has_exact_key(key): normed_keys.add(key) elif self.is_control and self._has_exact_key(key + '.value'): normed_keys.add(key + '.value') else: raise PropertyNameError(key, self.source) keys = normed_keys else: # Catches both an empty set and None. # While the public API describes an empty set to # refer to all keys, the internal API actually uses # None for this case. This method is supposed to # accept both cases in order to natively support # passing a DataCollection as the selector. To keep # the conditions below clearer, any non-True value # is converted to None. keys = None if self.sel_keys is None: # Current keys are unspecific - use the specified keys new_keys = keys elif keys is None: # Current keys are specific but new selection is not - use current new_keys = self.sel_keys else: # Both the new and current keys are specific: take the intersection. # The check above should ensure this never results in an empty set, # but new_keys = self.sel_keys & keys assert new_keys return SourceData( self.source, sel_keys=new_keys, train_ids=self.train_ids, files=self.files, section=self.section, canonical_name=self.canonical_name, is_single_run=self.is_single_run, inc_suspect_trains=self.inc_suspect_trains ) def select_trains(self, trains) -> 'SourceData': """Select a subset of trains in this data as a new :class:`SourceData` object. """ return self._only_tids(select_train_ids(self.train_ids, trains)) def _only_tids(self, tids, files=None) -> 'SourceData': if files is None: files = [ f for f in self.files if f.has_train_ids(tids, self.inc_suspect_trains) ] if not files: # Keep 1 file, even if 0 trains selected, to get keys, dtypes, etc. files = [self.files[0]] return SourceData( self.source, sel_keys=self.sel_keys, train_ids=tids, files=files, section=self.section, canonical_name=self.canonical_name, is_single_run=self.is_single_run, inc_suspect_trains=self.inc_suspect_trains ) def drop_empty_trains(self, index_group=None): """Select only trains with data as a new :class:`SourceData` object. If *index_group* is omitted, those trains with data for any of this source's index groups are selected. """ counts = self.data_counts(labelled=False, index_group=index_group) tids = np.array(self.train_ids)[counts > 0] return self._only_tids(list(tids)) def split_trains(self, parts=None, trains_per_part=None): """Split this data into chunks with a fraction of the trains each. Either *parts* or *trains_per_part* must be specified. This returns an iterator yielding new :class:`SourceData` objects. The parts will have similar sizes, e.g. splitting 11 trains with ``trains_per_part=8`` will produce 5 & 6 trains, not 8 & 3. Selected trains count even if they are missing data, so different keys from the same run can be split into matching chunks. Parameters ---------- parts: int How many parts to split the data into. If trains_per_part is also specified, this is a minimum, and it may make more parts. It may also make fewer if there are fewer trains in the data. trains_per_part: int A maximum number of trains in each part. Parts will often have fewer trains than this. """ # tids_files points to the file for each train. # This avoids checking all files for each chunk, which can be slow. tids_files = trains_files_index( self.train_ids, self.files, self.inc_suspect_trains ) for sl in split_trains(len(self.train_ids), parts, trains_per_part): tids = self.train_ids[sl] files = set(tids_files[sl]) - {None} files = sorted(files, key=lambda f: f.filename) yield self._only_tids(tids, files=files) def data_counts(self, labelled=True, index_group=None): """Get a count of data entries in each train. if *index_group* is omitted, the largest count across all index groups is returned for each train. If *labelled* is True, returns a pandas series with an index of train IDs. Otherwise, returns a NumPy array of counts to match ``.train_ids``. """ if index_group is None: # Collect data counts for a sample key per index group. data_counts = { index_group: self[self.one_key(index_group)].data_counts( labelled=labelled) for index_group in self.index_groups } if labelled: import pandas as pd return pd.DataFrame(data_counts).max(axis=1) else: return np.stack(list(data_counts.values())).max(axis=0) else: return self[self.one_key(index_group)].data_counts( labelled=labelled) def train_id_coordinates(self, index_group=None): """Make an array of train IDs to use alongside data this source. If *index_group* is omitted, the shared train ID coordinates across all index groups is returned if there is one. Unlike for ``.data_counts()``, an exception is raised if the train ID coordinates (and thus data counts) differ among the index groups. """ if index_group is None: if len(self.index_groups) > 1: # Verify that a common train ID coordinate exists for # multiple index groups. The reads necessary for this # operation are identical to those for the train ID # coordinates themselves. counts_per_group = np.stack([ self.data_counts(labelled=False, index_group=index_group) for index_group in self.index_groups]) if (counts_per_group != counts_per_group[0]).any(): raise ValueError('source has index groups with differing ' 'data counts') index_group = self.index_groups.pop() return self[self.one_key(index_group)].train_id_coordinates() def run_metadata(self) -> Dict: """Get a dictionary of metadata about the run From file format version 1.0, the files capture: creationDate, daqLibrary, dataFormatVersion, karaboFramework, proposalNumber, runNumber, sequenceNumber, updateDate. """ if not self.is_single_run: raise MultiRunError() return self.files[0].metadata() def run_value(self, key, *, allow_multi_run=False): """Get a single value from the RUN section of data files. This method is intended for use with data from a single run. If you combine data from multiple runs, it will raise MultiRunError. Returns the RUN parameter value corresponding to the *key* argument. """ if not (self.is_single_run or allow_multi_run): raise MultiRunError() if self.is_instrument: raise ValueError('Only CONTROL sources have run values, ' f'{self.source} is an INSTRUMENT source') # Arbitrary file - should be the same across a run ds = self.files[0].file['RUN'][self.source].get(key.replace('.', '/')) if isinstance(ds, h5py.Group): # Allow for the .value suffix being omitted ds = ds.get('value') if not isinstance(ds, h5py.Dataset): raise PropertyNameError(key, self.source) val = ds[0] if isinstance(val, bytes): # bytes -> str return val.decode('utf-8', 'surrogateescape') return val def run_values(self, inc_timestamps=True): """Get a dict of all RUN values for this source This includes keys which are also in CONTROL. """ if not self.is_single_run: raise MultiRunError() if self.is_instrument: raise ValueError('Only CONTROL sources have run values, ' f'{self.source} is an INSTRUMENT source') res = {} def visitor(path, obj): if isinstance(obj, h5py.Dataset): val = obj[0] if isinstance(val, bytes): val = val.decode('utf-8', 'surrogateescape') res[path.replace('/', '.')] = val # Arbitrary file - should be the same across a run self.files[0].file['RUN'][self.source].visititems(visitor) if not inc_timestamps: return {k[:-6]: v for (k, v) in res.items() if k.endswith('.value')} return res @property def device_class(self): """The name of the Karabo device class which this source belongs to Only for CONTROL data. This will be None for INSTRUMENT data, or if it's not available in the files. """ if self._device_class is ...: try: self._device_class = self.run_value('classId', allow_multi_run=True) except (PropertyNameError, ValueError): self._device_class = None return self._device_class def union(self, *others) -> 'SourceData': """Combine two or more ``SourceData`` objects These must be for the same source, e.g. from separate runs. """ if len({sd.source for sd in (self,) + others}) > 1: raise ValueError("Cannot use SourceData.union() with different sources") keygroups = [sd.sel_keys for sd in (self,) + others] files = set(self.files) train_ids = set(self.train_ids) for other in others: files.update(other.files) train_ids.update(other.train_ids) return SourceData( self.source, sel_keys=None if (None in keygroups) else set().union(*keygroups), train_ids=sorted(train_ids), files=sorted(files, key=lambda f: f.filename), section=self.section, canonical_name=self.canonical_name, is_single_run=same_run(self, *others), inc_suspect_trains=self.inc_suspect_trains ) def __or__(self, other): return self.union(other) def __ior__(self, other): return self.union(other) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/stacking.py0000644000175100001660000002153014757376472017631 0ustar00runnerdockerimport numpy as np import re # numpy.exceptions exists from 1.25 onwards, but for Python 3.8 we still support # numpy 1.24. We can clean this up once we require Python >= 3.9. try: from numpy.exceptions import AxisError except ImportError: from numpy import AxisError __all__ = [ 'stack_data', 'stack_detector_data', ] def stack_data(train, data, axis=-3, xcept=()): """Stack data from devices in a train. For detector data, use stack_detector_data instead: it can handle missing modules, which this function cannot. The returned array will have an extra dimension. The data will be ordered according to any groups of digits in the source name, interpreted as integers. Other characters do not affect sorting. So: "B_7_0" < "A_12_0" < "A_12_1" Parameters ---------- train: dict Train data. data: str The path to the device parameter of the data you want to stack. axis: int, optional Array axis on which you wish to stack. xcept: list List of devices to ignore (useful if you have reccored slow data with detector data in the same run). Returns ------- combined: numpy.array Stacked data for requested data path. """ devices = [dev for dev in train.keys() if dev not in xcept] if not devices: raise ValueError("No data after filtering by 'xcept' argument.") dtypes = set() ordered_arrays = [] for device in sorted(devices, key=lambda d: list(map(int, re.findall(r'\d+', d)))): array = train[device][data] dtypes.add(array.dtype) ordered_arrays.append(array) if len(dtypes) > 1: raise ValueError("Arrays have mismatched dtypes: {}".format(dtypes)) return np.stack(ordered_arrays, axis=axis) def stack_detector_data( train, data, axis=-3, modules=16, fillvalue=None, real_array=True, *, pattern=r'/DET/(\d+)CH', starts_at=0, ): """Stack data from detector modules in a train. Parameters ---------- train: dict Train data. data: str The path to the device parameter of the data you want to stack, e.g. 'image.data'. axis: int Array axis on which you wish to stack (default is -3). modules: int Number of modules composing a detector (default is 16). fillvalue: number Value to use in place of data for missing modules. The default is nan (not a number) for floating-point data, and 0 for integers. real_array: bool If True (default), copy the data together into a real numpy array. If False, avoid copying the data and return a limited array-like wrapper around the existing arrays. This is sufficient for assembling images using detector geometry, and allows better performance. pattern: str Regex to find the module number in source names. Should contain a group which can be converted to an integer. E.g. ``r'/DET/JNGFR(\\d+)'`` for one JUNGFRAU naming convention. starts_at: int By default, uses module numbers starting at 0 (e.g. 0-15 inclusive). If the numbering is e.g. 1-16 instead, pass starts_at=1. This is not automatic because the first or last module may be missing from the data. Returns ------- combined: numpy.array Stacked data for requested data path. """ if not train: raise ValueError("No data") dtypes, shapes, empty_mods = set(), set(), set() modno_arrays = {} for src in train: det_mod_match = re.search(pattern, src) if not det_mod_match: raise ValueError(f"Source {src!r} doesn't match pattern {pattern!r}") modno = int(det_mod_match.group(1)) - starts_at try: array = train[src][data] except KeyError: continue dtypes.add(array.dtype) shapes.add(array.shape) modno_arrays[modno] = array if len(dtypes) > 1: raise ValueError("Arrays have mismatched dtypes: {}".format(dtypes)) if len(shapes) > 1: s1, s2, *_ = sorted(shapes) if len(shapes) > 2 or (s1[0] != 0) or (s1[1:] != s2[1:]): raise ValueError("Arrays have mismatched shapes: {}".format(shapes)) empty_mods = {n for n, a in modno_arrays.items() if a.shape == s1} for modno in empty_mods: del modno_arrays[modno] shapes.remove(s1) if max(modno_arrays) >= modules: raise IndexError("Module {} is out of range for a detector with {} modules" .format(max(modno_arrays), modules)) dtype = dtypes.pop() shape = shapes.pop() if fillvalue is None: fillvalue = np.nan if dtype.kind == 'f' else 0 fillvalue = dtype.type(fillvalue) # check value compatibility with dtype stack = StackView( modno_arrays, modules, shape, dtype, fillvalue, stack_axis=axis ) if real_array: return stack.asarray() return stack class StackView: """Limited array-like object holding detector data from several modules. Access is limited to either a single module at a time or all modules together, but this is enough to assemble detector images. """ def __init__(self, data, nmodules, mod_shape, dtype, fillvalue, stack_axis=-3): self._nmodules = nmodules self._data = data # {modno: array} self.dtype = dtype self._fillvalue = fillvalue self._mod_shape = mod_shape self.ndim = len(mod_shape) + 1 self._stack_axis = stack_axis if self._stack_axis < 0: self._stack_axis += self.ndim sax = self._stack_axis self.shape = mod_shape[:sax] + (nmodules,) + mod_shape[sax:] def __repr__(self): return "".format( self.shape, len(self._data), self._nmodules, self.dtype, ) # Multidimensional slicing def __getitem__(self, slices): if not isinstance(slices, tuple): slices = (slices,) missing_dims = self.ndim - len(slices) if Ellipsis in slices: ix = slices.index(Ellipsis) missing_dims += 1 slices = slices[:ix] + (slice(None, None),) * missing_dims + slices[ix + 1:] else: slices = slices + (slice(None, None),) * missing_dims modno = slices[self._stack_axis] mod_slices = slices[:self._stack_axis] + slices[self._stack_axis + 1:] if isinstance(modno, int): if modno < 0: modno += self._nmodules return self._get_single_mod(modno, mod_slices) elif modno == slice(None, None): return self._get_all_mods(mod_slices) else: raise Exception( "VirtualStack can only slice a single module or all modules" ) def _get_single_mod(self, modno, mod_slices): try: mod_data = self._data[modno] except KeyError: if modno >= self._nmodules: raise IndexError(modno) mod_data = np.full(self._mod_shape, self._fillvalue, self.dtype) self._data[modno] = mod_data # Now slice the module data as requested return mod_data[mod_slices] def _get_all_mods(self, mod_slices): new_data = {modno: self._get_single_mod(modno, mod_slices) for modno in self._data} new_mod_shape = list(new_data.values())[0].shape return StackView(new_data, self._nmodules, new_mod_shape, self.dtype, self._fillvalue) def asarray(self): """Copy this data into a real numpy array Don't do this until necessary - the point of using VirtualStack is to avoid copying the data unnecessarily. """ start_shape = (self._nmodules,) + self._mod_shape arr = np.full(start_shape, self._fillvalue, dtype=self.dtype) for modno, data in self._data.items(): arr[modno] = data return np.moveaxis(arr, 0, self._stack_axis) def squeeze(self, axis=None): """Drop axes of length 1 - see numpy.squeeze()""" if axis is None: slices = [0 if d == 1 else slice(None, None) for d in self.shape] elif isinstance(axis, (int, tuple)): if isinstance(axis, int): axis = (axis,) slices = [slice(None, None)] * self.ndim for ax in axis: try: slices[ax] = 0 except IndexError: raise AxisError( "axis {} is out of bounds for array of dimension {}" .format(ax, self.ndim) ) if self.shape[ax] != 1: raise ValueError("cannot squeeze out an axis with size != 1") else: raise TypeError("axis={!r} not supported".format(axis)) return self[tuple(slices)] ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1740504381.8757262 extra_data-1.20.0/extra_data/tests/0000755000175100001660000000000014757376476016621 5ustar00runnerdocker././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/__init__.py0000644000175100001660000000000014757376472020714 0ustar00runnerdocker././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1740504381.8757262 extra_data-1.20.0/extra_data/tests/cli/0000755000175100001660000000000014757376476017370 5ustar00runnerdocker././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/cli/__init__.py0000644000175100001660000000000014757376472021463 0ustar00runnerdocker././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/cli/test_make_virtual_cxi.py0000644000175100001660000000145614757376472024331 0ustar00runnerdockerimport os import os.path as osp from testpath import assert_isfile from extra_data.cli.make_virtual_cxi import main def test_make_virtual_cxi(mock_spb_proc_run, tmpdir): output = osp.join(str(tmpdir), 'test.cxi') main([mock_spb_proc_run, '-o', output]) assert_isfile(output) def test_make_virtual_cxi_runno(mock_spb_proc_run, tmpdir): proc = osp.join(str(tmpdir), 'proc') os.mkdir(proc) os.symlink(mock_spb_proc_run, osp.join(proc, 'r0238')) output = osp.join(str(tmpdir), 'test.cxi') # Pass proposal directory and run number main([str(tmpdir), '238', '-o', output]) assert_isfile(output) def test_make_virtual_cxi_jungfrau(mock_jungfrau_run, tmpdir): output = osp.join(str(tmpdir), 'test.cxi') main([mock_jungfrau_run, '-o', output]) assert_isfile(output) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/conftest.py0000644000175100001660000001636614757376472021030 0ustar00runnerdockerimport os import os.path as osp import h5py import numpy as np import pytest from tempfile import TemporaryDirectory from . import make_examples @pytest.fixture(scope='session', params=['0.5', '1.0', '1.2']) def format_version(request): return request.param @pytest.fixture(scope='module') def mock_agipd_data(format_version): # This one uses the older index format # (first/last/status instead of first/count) with TemporaryDirectory() as td: path = osp.join(td, 'CORR-R9999-AGIPD07-S00000.h5') make_examples.make_agipd_example_file(path, format_version=format_version) yield path @pytest.fixture(scope='module') def mock_lpd_data(format_version): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R9999-LPD00-S00000.h5') make_examples.make_lpd_file(path, format_version=format_version) yield path @pytest.fixture(scope='module') def mock_fxe_control_data(format_version): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R0450-DA01-S00001.h5') make_examples.make_fxe_da_file(path, format_version=format_version) yield path @pytest.fixture(scope='module') def mock_fxe_control_data1(format_version): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R0451-DA01-S00001.h5') make_examples.make_fxe_da_file(path, firsttrain=20000, format_version=format_version) yield path @pytest.fixture(scope='module') def mock_sa3_control_data(format_version): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R0450-DA01-S00001.h5') make_examples.make_sa3_da_file(path, format_version=format_version) yield path @pytest.fixture def mock_sa3_control_aliases(): return { 'sa3-xgm': 'SA3_XTD10_XGM/XGM/DOOCS', 'hv': ('SA3_XTD10_XGM/XGM/DOOCS', 'pulseEnergy.wavelengthUsed'), 'beam-x': ('SA3_XTD10_XGM/XGM/DOOCS', 'beamPosition.ixPos'), 'beam-y': ('SA3_XTD10_XGM/XGM/DOOCS', 'beamPosition.iyPos'), 'imgfel-frames': ('SA3_XTD10_IMGFEL/CAM/BEAMVIEW:daqOutput', 'data.image.pixels'), 'imgfel-frames2': ('SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput', 'data.image.pixels'), 'imgfel-screen-pos': ('SA3_XTD10_IMGFEL/MOTOR/SCREEN', 'actualPosition'), 'imgfel-filter-pos': ('SA3_XTD10_IMGFEL/MOTOR/FILTER', 'actualPosition'), 'mcp-adc': 'SA3_XTD10_MCP/ADC/1', 'mcp-mpod': 'SA3_XTD10_MCP/MCPS/MPOD', 'mcp-voltage': ('SA3_XTD10_MCP/MCPS/MPOD', 'channels.U3.voltage'), 'mcp-trace': ('SA3_XTD10_MCP/ADC/1:channel_5.output', 'data.rawData'), 'bogus-source': 'SA4_XTD20_XGM/XGM/DOOCS', 'bogus-key': ('SA3_XTD10_XGM/XGM/DOOCS', 'foo') } @pytest.fixture(scope='module') def mock_control_data_with_empty_source(format_version): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R0451-DA01-S00001.h5') make_examples.make_da_file_with_empty_source(path, format_version=format_version) yield path @pytest.fixture(scope='module') def mock_spb_control_data_badname(format_version): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R0309-DA01-S00000.h5') make_examples.make_data_file_bad_device_name(path, format_version=format_version) yield path @pytest.fixture(scope='session') def mock_fxe_raw_run(format_version): with TemporaryDirectory() as td: make_examples.make_fxe_run(td, format_version=format_version) yield td @pytest.fixture(scope='session') def mock_lpd_parallelgain_run(): with TemporaryDirectory() as td: make_examples.make_lpd_parallelgain_run(td, format_version='1.0') yield td @pytest.fixture(scope='session') def mock_lpd_mini_gap_run(): with TemporaryDirectory() as td: make_examples.make_lpd_run_mini_missed_train(td) yield td @pytest.fixture(scope='session') def mock_spb_proc_run(format_version): with TemporaryDirectory() as td: make_examples.make_spb_run(td, raw=False, format_version=format_version) yield td @pytest.fixture(scope='session') def mock_reduced_spb_proc_run(format_version): """Varying number of frames stored from AGIPD""" rng = np.random.RandomState(123) # Fix seed with TemporaryDirectory() as td: make_examples.make_reduced_spb_run(td, raw=False, rng=rng, format_version=format_version) yield td @pytest.fixture(scope='session') def mock_spb_raw_run(format_version): with TemporaryDirectory() as td: make_examples.make_spb_run(td, format_version=format_version) yield td @pytest.fixture() def mock_spb_raw_and_proc_run(): with TemporaryDirectory() as td: prop_dir = osp.join(str(td), 'SPB', '201830', 'p002012') # Set up raw raw_run_dir = osp.join(prop_dir, 'raw', 'r0238') os.makedirs(raw_run_dir) make_examples.make_spb_run(raw_run_dir) # Set up proc proc_run_dir = osp.join(prop_dir, 'proc', 'r0238') os.makedirs(proc_run_dir) make_examples.make_spb_run(proc_run_dir, raw=False) yield td, raw_run_dir, proc_run_dir @pytest.fixture(scope='session') def mock_spb_raw_run_fmt1(): with TemporaryDirectory() as td: make_examples.make_spb_run(td, format_version="1.2") yield td @pytest.fixture(scope='session') def mock_modern_spb_proc_run(): with TemporaryDirectory() as td: make_examples.make_modern_spb_proc_run(td) yield td @pytest.fixture() def mock_spb_raw_and_modern_proc_run(): with TemporaryDirectory() as td: prop_dir = osp.join(str(td), 'SPB', '201830', 'p002012') # Set up raw raw_run_dir = osp.join(prop_dir, 'raw', 'r0238') os.makedirs(raw_run_dir) make_examples.make_spb_run(raw_run_dir) # Set up proc proc_run_dir = osp.join(prop_dir, 'proc', 'r0238') os.makedirs(proc_run_dir) make_examples.make_modern_spb_proc_run(proc_run_dir) yield td, raw_run_dir, proc_run_dir @pytest.fixture(scope='session') def mock_jungfrau_run(): with TemporaryDirectory() as td: make_examples.make_jungfrau_run(td) yield td @pytest.fixture(scope='session') def mock_fxe_jungfrau_run(): with TemporaryDirectory() as td: make_examples.make_fxe_jungfrau_run(td) yield td @pytest.fixture(scope='session') def mock_scs_run(): with TemporaryDirectory() as td: make_examples.make_scs_run(td) yield td @pytest.fixture(scope='session') def mock_remi_run(): with TemporaryDirectory() as td: make_examples.make_remi_run(td) yield td @pytest.fixture(scope='session') def empty_h5_file(): with TemporaryDirectory() as td: path = osp.join(td, 'empty.h5') with h5py.File(path, 'w'): pass yield path @pytest.fixture(scope='session') def mock_no_metadata_file(): with TemporaryDirectory() as td: path = osp.join(td, 'no_metadata.h5') with h5py.File(path, 'w') as f: f.create_dataset('INDEX/trainId', data=[], dtype=np.uint64) yield path @pytest.fixture(scope='session') def mock_empty_file(): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R0450-DA01-S00002.h5') make_examples.make_sa3_da_file(path, ntrains=0) yield path ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/make_examples.py0000644000175100001660000005111714757376472022007 0ustar00runnerdockerimport os import os.path as osp import h5py import numpy as np from .mockdata import write_file from .mockdata.adc import ADC from .mockdata.agipd import AGIPD1MFPGA, AGIPD1MPSC, AGIPD500KFPGA, AGIPDMDL from .mockdata.base import write_base_index from .mockdata.basler_camera import BaslerCamera as BaslerCam from .mockdata.dctrl import DCtrl from .mockdata.detectors import AGIPDModule, DSSCModule, LPDModule from .mockdata.gauge import Gauge from .mockdata.gec_camera import GECCamera from .mockdata.imgfel import IMGFELCamera, IMGFELMotor from .mockdata.jungfrau import ( JUNGFRAUControl, JUNGFRAUModule, JUNGFRAUMonitor, JUNGFRAUPower ) from .mockdata.motor import Motor from .mockdata.mpod import MPOD from .mockdata.proc import ReconstructedDLD6 from .mockdata.tsens import TemperatureSensor from .mockdata.uvlamp import UVLamp from .mockdata.xgm import XGM vlen_bytes = h5py.special_dtype(vlen=bytes) def make_metadata(h5file, data_sources, chunksize=16): N = len(data_sources) if N % chunksize: N += chunksize - (N % chunksize) root = [ds.split('/', 1)[0] for ds in data_sources] devices = [ds.split('/', 1)[1] for ds in data_sources] sources_ds = h5file.create_dataset('METADATA/dataSourceId', (N,), dtype=vlen_bytes, maxshape=(None,)) sources_ds[:len(data_sources)] = data_sources root_ds = h5file.create_dataset('METADATA/root', (N,), dtype=vlen_bytes, maxshape=(None,)) root_ds[:len(data_sources)] = root devices_ds = h5file.create_dataset('METADATA/deviceId', (N,), dtype=vlen_bytes, maxshape=(None,)) devices_ds[:len(data_sources)] = devices def make_agipd_example_file(path, format_version='0.5'): """Make the structure of a data file from the AGIPD detector Based on /gpfs/exfel/d/proc/XMPL/201750/p700000/r0803/CORR-R0803-AGIPD07-S00000.h5 This has the old index format (first/last/status), whereas the other examples have the newer (first/count) format. """ f = h5py.File(path, 'w') slow_channels = ['header', 'detector', 'trailer'] channels = slow_channels + ['image'] train_ids = np.arange(10000, 10250) # Real train IDs are ~10^9 # RUN - empty in the example I'm working from f.create_group('RUN') # METADATA - lists the data sources in this file make_metadata(f, ['INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/'+ch for ch in channels]) def make_train_ids(path): d = f.create_dataset(path, (256,), 'u8', maxshape=(None,)) d[:250] = train_ids # INDEX - matching up data to train IDs write_base_index(f, 250, format_version=format_version) for ch in channels: grp_name = 'INDEX/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/%s/' % ch first = f.create_dataset(grp_name + 'first', (256,), 'u8', maxshape=(None,)) last = f.create_dataset(grp_name + 'last', (256,), 'u8', maxshape=(None,)) status = f.create_dataset(grp_name + 'status', (256,), 'u4', maxshape=(None,)) if ch in slow_channels: first[:250] = np.arange(250) last[:250] = np.arange(250) else: first[:250] = np.arange(0, 16000, 64) last[:250] = np.arange(63, 16000, 64) status[:250] = 1 # INSTRUMENT - the data itself # first, train IDs for each channel for ch in slow_channels: make_train_ids('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/%s/trainId' % ch) fast_tids = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/trainId', (16000, 1), 'u8') fast_tids[:,0] = np.repeat(train_ids, 64) # TODO: Not sure what this is, but it has quite a regular structure. # 5408 = 13 x 13 x 32 f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/detector/data', (256, 5408), 'u1', maxshape=(None, 5408)) f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/header/dataId', (256,), 'u8', maxshape=(None,)) # Empty in example linkId = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/header/linkId', (256,), 'u8', maxshape=(None,)) linkId[:250] = 18446744069414584335 # Copied from example f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/header/magicNumberBegin', (256, 8), 'i1', maxshape=(None, 8)) # TODO: fill in data vmaj = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/header/majorTrainFormatVersion', (256,), 'u4', maxshape=(None,)) vmaj[:250] = 1 vmin = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/header/minorTrainFormatVersion', (256,), 'u4', maxshape=(None,)) vmin[:250] = 0 pc = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/header/pulseCount', (256,), 'u8', maxshape=(None,)) pc[:250] = 64 f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/header/reserved', (256, 16), 'u1', maxshape=(None, 16)) # Empty in example cellId = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/cellId', (16000, 1), 'u2') cellId[:, 0] = np.tile(np.arange(64), 250) # The data itself f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/data', (16000, 512, 128), 'f4') f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/gain', (16000, 512, 128), 'u1') length = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/length', (16000, 1), 'u4', maxshape=(None, 1)) length[:] = 262144 # = 512*128*4(bytes) ? f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/mask', (16000, 512, 128, 3), 'u1') # TODO: values 128 or 0 pulseId = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/pulseId', (16000, 1), 'u8') # In the real data, these are unevenly spaced, but this is close enough pulseId[:, 0] = np.tile(np.linspace(0, 125, 64, dtype='u8'), 250) f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/status', (16000, 1), 'u2') # Empty in example f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/trailer/checksum', (256, 16), 'i1', maxshape=(None, 16)) # Empty in example f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/trailer/magicNumberEnd', (256, 8), 'i1', maxshape=(None, 8)) # TODO: fill in data f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/trailer/status', (256,), 'u8', maxshape=(None,)) # Empty in example def make_fxe_da_file(path, format_version='0.5', firsttrain=10000): """Make the structure of a file with non-detector data from the FXE experiment Based on .../FXE/201830/p900023/r0450/RAW-R0450-DA01-S00001.h5 """ write_file(path, [ XGM('SA1_XTD2_XGM/DOOCS/MAIN'), XGM('SPB_XTD9_XGM/DOOCS/MAIN'), GECCamera('FXE_XAD_GEC/CAM/CAMERA'), GECCamera('FXE_XAD_GEC/CAM/CAMERA_NODATA', nsamples=0) ], ntrains=400, chunksize=200, firsttrain=firsttrain, format_version=format_version) def make_sa3_da_file(path, ntrains=500, format_version='0.5'): """Make the structure of a file with non-detector data from SASE3 tunnel Based on .../SA3/201830/p900026/r0317/RAW-R0317-DA01-S00000.h5 """ write_file(path, [ ADC('SA3_XTD10_MCP/ADC/1', nsamples=0, channels=( 'channel_3.output/data', 'channel_5.output/data', 'channel_9.output/data', )), UVLamp('SA3_XTD10_MCP/DCTRL/UVLAMP'), Motor('SA3_XTD10_MCP/MOTOR/X2'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30100K'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30160K'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30180K'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30190K'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30200K'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30250K'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30260K'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30280K'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30300K'), Gauge('SA3_XTD10_VAC/GAUGE/G30470D_IN'), Gauge('SA3_XTD10_VAC/GAUGE/G30480D_IN'), Gauge('SA3_XTD10_VAC/GAUGE/G30490D_IN'), Gauge('SA3_XTD10_VAC/GAUGE/G30500P'), Gauge('SA3_XTD10_VAC/GAUGE/G30510C'), DCtrl('SA3_XTD10_VAC/DCTRL/D6_APERT_IN_OK'), DCtrl('SA3_XTD10_VAC/DCTRL/D12_APERT_IN_OK'), XGM('SA3_XTD10_XGM/XGM/DOOCS'), IMGFELCamera('SA3_XTD10_IMGFEL/CAM/BEAMVIEW', nsamples=0), IMGFELCamera('SA3_XTD10_IMGFEL/CAM/BEAMVIEW2', nsamples=250), IMGFELCamera('SA3_XTD10_IMGFEL/CAM/BEAMVIEW3', nsamples=200), IMGFELMotor('SA3_XTD10_IMGFEL/MOTOR/FILTER'), IMGFELMotor('SA3_XTD10_IMGFEL/MOTOR/SCREEN'), MPOD('SA3_XTD10_MCP/MCPS/MPOD'), ], ntrains=ntrains, chunksize=50, format_version=format_version) def make_da_file_with_empty_source(path, ntrains=500, format_version='0.5'): write_file(path, [ ADC('SA3_XTD10_MCP/ADC/1', nsamples=0, channels=( 'channel_3.output/data', 'channel_5.output/data', 'channel_9.output/data', )), UVLamp('SA3_XTD10_MCP/DCTRL/UVLAMP'), Motor('SA3_XTD10_MCP/MOTOR/X2'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30100K'), Gauge('SA3_XTD10_VAC/GAUGE/G30510C'), Gauge('SA3_XTD10_VAC/GAUGE/G30520C', no_ctrl_data=True), DCtrl('SA3_XTD10_VAC/DCTRL/D6_APERT_IN_OK'), XGM('SA3_XTD10_XGM/XGM/DOOCS'), IMGFELCamera('SA3_XTD10_IMGFEL/CAM/BEAMVIEW', nsamples=0), IMGFELCamera('SA3_XTD10_IMGFEL/CAM/BEAMVIEW2', nsamples=250), IMGFELMotor('SA3_XTD10_IMGFEL/MOTOR/FILTER'), MPOD('SA3_XTD10_MCP/MCPS/MPOD'), ], ntrains=ntrains, chunksize=50, format_version=format_version) def make_data_file_bad_device_name(path, format_version='0.5'): """Not all devices have the Karabo standard A/B/C naming convention""" write_file(path, [ BaslerCam('SPB_IRU_SIDEMIC_CAM', sensor_size=(1000, 1000)) ], ntrains=500, chunksize=50, format_version=format_version) def make_agipd_file(path, format_version='0.5'): write_file(path, [ AGIPDModule('SPB_DET_AGIPD1M-1/DET/0CH0', frames_per_train=64) ], ntrains=486, chunksize=32, format_version=format_version) def make_lpd_file(path, format_version='0.5'): write_file(path, [ LPDModule('FXE_DET_LPD1M-1/DET/0CH0', frames_per_train=128) ], ntrains=480, chunksize=32, format_version=format_version) def make_fxe_run(dir_path, raw=True, format_version='0.5'): prefix = 'RAW' if raw else 'CORR' for modno in range(16): path = osp.join(dir_path, '{}-R0450-LPD{:0>2}-S00000.h5'.format(prefix, modno)) write_file(path, [ LPDModule('FXE_DET_LPD1M-1/DET/{}CH0'.format(modno), raw=raw, frames_per_train=128) ], ntrains=480, chunksize=32, format_version=format_version) if not raw: return write_file(osp.join(dir_path, 'RAW-R0450-DA01-S00000.h5'), [ XGM('SA1_XTD2_XGM/DOOCS/MAIN'), XGM('SPB_XTD9_XGM/DOOCS/MAIN'), GECCamera('FXE_XAD_GEC/CAM/CAMERA'), GECCamera('FXE_XAD_GEC/CAM/CAMERA_NODATA', nsamples=0), ], ntrains=400, chunksize=200, format_version=format_version) write_file(osp.join(dir_path, '{}-R0450-DA01-S00001.h5'.format(prefix)), [ XGM('SA1_XTD2_XGM/DOOCS/MAIN'), XGM('SPB_XTD9_XGM/DOOCS/MAIN'), GECCamera('FXE_XAD_GEC/CAM/CAMERA'), GECCamera('FXE_XAD_GEC/CAM/CAMERA_NODATA', nsamples=0), ], ntrains=80, firsttrain=10400, chunksize=200, format_version=format_version) def make_lpd_parallelgain_run(dir_path, raw=True, format_version='0.5'): prefix = 'RAW' if raw else 'CORR' for modno in range(16): path = osp.join(dir_path, '{}-R0450-LPD{:0>2}-S00000.h5'.format(prefix, modno)) write_file(path, [ LPDModule('FXE_DET_LPD1M-1/DET/{}CH0'.format(modno), raw=raw, frames_per_train=300) ], ntrains=100, chunksize=32, format_version=format_version) def make_lpd_run_mini_missed_train(dir_path): write_file(osp.join(dir_path, 'RAW-R0450-LPD00-S00000.h5'), [ LPDModule('FXE_DET_LPD1M-1/DET/0CH0', frames_per_train=10), ], ntrains=5, chunksize=5, format_version='1.0') mod1_f = osp.join(dir_path, 'RAW-R0450-LPD01-S00000.h5') write_file(mod1_f, [ LPDModule('FXE_DET_LPD1M-1/DET/1CH0', frames_per_train=10), ], ntrains=4, chunksize=5, format_version='1.0') # Modify the file for module 1, as if it missed train 10002 # & fill some data to check in the test. with h5py.File(mod1_f, 'r+') as f: f['INDEX/trainId'][:4] = [10000, 10001, 10003, 10004] mod1_dset = f['INSTRUMENT/FXE_DET_LPD1M-1/DET/1CH0:xtdf/image/data'] mod1_dset[8::10, 0, 0, 0] = np.arange(1, 5) def make_spb_run(dir_path, raw=True, sensor_size=(1024, 768), format_version='0.5'): prefix = 'RAW' if raw else 'CORR' for modno in range(16): path = osp.join(dir_path, '{}-R0238-AGIPD{:0>2}-S00000.h5'.format(prefix, modno)) write_file(path, [ AGIPDModule('SPB_DET_AGIPD1M-1/DET/{}CH0'.format(modno), raw=raw, frames_per_train=64) ], ntrains=64, chunksize=32, format_version=format_version) if not raw: return write_file(osp.join(dir_path, '{}-R0238-DA01-S00000.h5'.format(prefix)), [ XGM('SA1_XTD2_XGM/DOOCS/MAIN'), XGM('SPB_XTD9_XGM/DOOCS/MAIN'), BaslerCam('SPB_IRU_CAM/CAM/SIDEMIC', sensor_size=sensor_size) ], ntrains=32, chunksize=32, format_version=format_version) write_file(osp.join(dir_path, '{}-R0238-DA01-S00001.h5'.format(prefix)), [ XGM('SA1_XTD2_XGM/DOOCS/MAIN'), XGM('SPB_XTD9_XGM/DOOCS/MAIN'), BaslerCam('SPB_IRU_CAM/CAM/SIDEMIC', sensor_size=sensor_size) ], ntrains=32, firsttrain=10032, chunksize=32, format_version=format_version) def make_reduced_spb_run(dir_path, raw=True, rng=None, format_version='0.5'): # Simulate reduced AGIPD data, with varying number of frames per train. # Counts across modules should be consistent prefix = 'RAW' if raw else 'CORR' if rng is None: rng = np.random.RandomState() frame_counts = rng.randint(0, 20, size=64) for modno in range(16): path = osp.join(dir_path, '{}-R0238-AGIPD{:0>2}-S00000.h5'.format(prefix, modno)) write_file(path, [ AGIPDModule('SPB_DET_AGIPD1M-1/DET/{}CH0'.format(modno), raw=raw, frames_per_train=frame_counts) ], ntrains=64, chunksize=32, format_version=format_version) if modno == 9 and not raw: # For testing masked_data with h5py.File(path, 'a') as f: mask_ds = f['INSTRUMENT/SPB_DET_AGIPD1M-1/DET/9CH0:xtdf/image/mask'] mask_ds[0, 0, :32] = np.arange(32) write_file(osp.join(dir_path, '{}-R0238-DA01-S00000.h5'.format(prefix)), [ XGM('SA1_XTD2_XGM/DOOCS/MAIN'), XGM('SPB_XTD9_XGM/DOOCS/MAIN'), BaslerCam('SPB_IRU_CAM/CAM/SIDEMIC', sensor_size=(1024, 768)) ], ntrains=32, chunksize=32, format_version=format_version) write_file(osp.join(dir_path, '{}-R0238-DA01-S00001.h5'.format(prefix)), [ XGM('SA1_XTD2_XGM/DOOCS/MAIN'), XGM('SPB_XTD9_XGM/DOOCS/MAIN'), BaslerCam('SPB_IRU_CAM/CAM/SIDEMIC', sensor_size=(1024, 768)) ], ntrains=32, firsttrain=10032, chunksize=32, format_version=format_version) def make_modern_spb_proc_run(dir_path, format_version='1.2'): for modno in range(16): path = osp.join(dir_path, f'CORR-R0142-AGIPD{modno:0>2}-S00000.h5') write_file(path, [ AGIPDModule(f'SPB_DET_AGIPD1M-1/CORR/{modno}CH0', channel_name='output', raw=False, frames_per_train=32, legacy_name=f'SPB_DET_AGIPD1M-1/DET/{modno}CH0') ], ntrains=64, chunksize=32, format_version=format_version) def make_agipd1m_run( dir_path, rep_rate=True, gain_setting=True, integration_time=True, bias_voltage=True ): # Naming based on /gpfs/exfel/exp/SPB/202130/p900203/raw/r9015 for modno in range(16): path = osp.join(dir_path, f'RAW-R9015-AGIPD{modno:02}-S00000.h5') write_file(path, [ AGIPDModule( f'SPB_DET_AGIPD1M-1/DET/{modno}CH0', frames_per_train=64) ], ntrains=100, chunksize=32, format_version='1.0') write_file(osp.join(dir_path, 'RAW-R9015-AGIPD1MCTRL00-S00000.h5'), [ AGIPDMDL( 'SPB_IRU_AGIPD1M1/MDL/FPGA_COMP', rep_rate=rep_rate, gain_setting=gain_setting, integration_time=integration_time, ), AGIPD1MFPGA('SPB_IRU_AGIPD1M1/FPGA/MASTER_H1'), AGIPD1MPSC('SPB_IRU_AGIPD1M/PSC/HV', bias_voltage=bias_voltage), ], ntrains=100, chunksize=1, format_version='1.0') def make_agipd500k_run(dir_path): # Naming based on /gpfs/exfel/exp/SPB/202130/p900203/raw/r9023 for modno in range(8): path = osp.join(dir_path, f'RAW-R9023-AGIPD{modno:02}-S00000.h5') write_file(path, [ AGIPDModule( f'HED_DET_AGIPD500K2G/DET/{modno}CH0', frames_per_train=64) ], ntrains=100, chunksize=32, format_version='1.0') write_file(osp.join(dir_path, 'RAW-R9023-AGIPD500K2G00-S00000.h5'), [ AGIPDMDL('HED_EXP_AGIPD500K2G/MDL/FPGA_COMP'), AGIPD500KFPGA('HED_EXP_AGIPD500K2G/FPGA/M_0'), ], ntrains=100, chunksize=1, format_version='1.0') def make_jungfrau_run(dir_path): # Naming based on /gpfs/exfel/exp/SPB/202022/p002732/raw/r0012 for modno in range(1, 9): path = osp.join(dir_path, f'RAW-R0012-JNGFR{modno:02}-S00000.h5') write_file(path, [ JUNGFRAUModule(f'SPB_IRDA_JF4M/DET/JNGFR{modno:02}') ], ntrains=100, chunksize=1, format_version='1.0') write_file(osp.join(dir_path, f'RAW-R0012-JNGFRCTRL00-S00000.h5'), [ JUNGFRAUControl('SPB_IRDA_JF4M/DET/CONTROL'), JUNGFRAUMonitor('SPB_IRDA_JF4M/MDL/MONITOR'), JUNGFRAUPower('SPB_IRDA_JF4M/MDL/POWER'), ], ntrains=100, chunksize=1, format_version='1.0') def make_fxe_jungfrau_run(dir_path): # Naming based on /gpfs/exfel/exp/FXE/202101/p002478/raw/ for modno in range(1, 3): path = osp.join(dir_path, f'RAW-R0012-JNGFR{modno:02}-S00000.h5') write_file(path, [ JUNGFRAUModule(f'FXE_XAD_JF1M/DET/JNGFR{modno:02}') ], ntrains=100, chunksize=1, format_version='1.0') path = osp.join(dir_path, f'RAW-R0052-JNGFR03-S00000.h5') write_file(path, [ JUNGFRAUModule(f'FXE_XAD_JF500K/DET/JNGFR03') ], ntrains=100, chunksize=1, format_version='1.0') with h5py.File(path, 'a') as f: # For testing masked_data mask_ds = f['INSTRUMENT/FXE_XAD_JF500K/DET/JNGFR03:daqOutput/data/mask'] mask_ds[0, 0, 0, :32] = np.arange(32) write_file(osp.join(dir_path, f'RAW-R0052-JNGFRCTRL00-S00000.h5'), [ JUNGFRAUControl('FXE_XAD_JF1M/DET/CONTROL'), JUNGFRAUControl('FXE_XAD_JF500K/DET/CONTROL'), ], ntrains=100, chunksize=1, format_version='1.0') def make_remi_run(dir_path): write_file(osp.join(dir_path, f'CORR-R0210-REMI01-S00000.h5'), [ ReconstructedDLD6('SQS_REMI_DLD6/DET/TOP'), ], ntrains=100, chunksize=1, format_version='1.0') def make_scs_run(dir_path): # Multiple sequence files for detector modules for modno in range(16): mod = DSSCModule(f'SCS_DET_DSSC1M-1/DET/{modno}CH0', frames_per_train=64) for seq in range(2): path = osp.join(dir_path, f'RAW-R0163-DSSC{modno:0>2}-S{seq:0>5}.h5') write_file(path, [mod], ntrains=64, firsttrain=(10000 + seq * 64), chunksize=32, format_version='1.0') if __name__ == '__main__': make_agipd_example_file('agipd_example.h5') make_fxe_da_file('fxe_control_example.h5') make_sa3_da_file('sa3_control_example.h5') make_agipd_file('agipd_example2.h5') make_lpd_file('lpd_example.h5') os.makedirs('fxe_example_run', exist_ok=True) make_fxe_run('fxe_example_run') os.makedirs('spb_example_run', exist_ok=True) make_spb_run('spb_example_run') print("Written examples.") ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1740504381.8797262 extra_data-1.20.0/extra_data/tests/mockdata/0000755000175100001660000000000014757376476020404 5ustar00runnerdocker././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/__init__.py0000644000175100001660000000003714757376472022511 0ustar00runnerdockerfrom .mkfile import write_file ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/adc.py0000644000175100001660000000372514757376472021510 0ustar00runnerdockerfrom .base import DeviceBase class ADC(DeviceBase): def __init__(self, device_id, nsamples=None, channels=()): super().__init__(device_id, nsamples) self.output_channels = channels control_keys = [ ('config/softTrigTime', 'u4', ()), ('dacNode/dacCyclesSamples', 'u4', ()), ('dacNode/dacData', 'i4', (1024,)), ('dacNode/dacSkipSamples', 'u4', ()), ('dacNode/dacTrigger', 'u1', ()), ('dacNode/dacTriggerPeriod', 'u4', ()), ('dacNode/dacVoltageData', 'f8', (1000,)), ('dacNode/enableDAC', 'u1', ()), ('dacNode/voltageIntercept', 'f8', ()), ('dacNode/voltageSlope', 'f8', ()), ('delay', 'u4', ()), ('numberRawSamples', 'u4', ()), ('skipSamples', 'u4', ()), ('trainId', 'u8', ()), ('triggerTime', 'i4', ()), ('triggerTimeStat', 'u2', (1000,)), ] + sum(([ ('channel_%d/baseStart' % n, 'u4', ()), ('channel_%d/baseStop' % n, 'u4', ()), ('channel_%d/baseline' % n, 'f4', ()), ('channel_%d/calibrationFactor' % n, 'f8', ()), ('channel_%d/enablePeakComputation' % n, 'u1', ()), ('channel_%d/enableRawDataStreaming' % n, 'u1', ()), ('channel_%d/fixedBaseline' % n, 'f8', ()), ('channel_%d/fixedBaselineEna' % n, 'u1', ()), ('channel_%d/initialDelay' % n, 'u4', ()), ('channel_%d/numPulses' % n, 'u4', ()), ('channel_%d/peakMean' % n, 'f4', ()), ('channel_%d/peakSamples' % n, 'u4', ()), ('channel_%d/peakStd' % n, 'f4', ()), ('channel_%d/pulsePeriod' % n, 'u4', ()), ] for n in range(10)), []) instrument_keys = [ ('baseline', 'f8', ()), ('peakMean', 'f8', ()), ('peakStd', 'f8', ()), ('peaks', 'f4', (1000,)), ('rawBaseline', 'u4', ()), ('rawData', 'u2', (4096,)), ('rawPeaks', 'u4', (1000,)), ('samplesForBaseline', 'u4', ()), ('samplesPerPeak', 'u4', ()), ] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/agipd.py0000644000175100001660000001136314757376472022042 0ustar00runnerdockerfrom .base import DeviceBase class AGIPDMDL(DeviceBase): def __init__( self, device_id, rep_rate=True, gain_setting=True, integration_time=True ): super().__init__(device_id) self.rep_rate = rep_rate self.gain_setting = gain_setting self.integration_time = integration_time # A sample of some of the available keys. self.control_keys = [ ('acquisitionTime', 'u2', ()), ('bunchStructure/nPulses', 'u2', ()), ('bunchStructure/firstPulse', 'u2', ()), ('bunchStructure/methodIndex', 'u2', ()), ('setupr', 'u2', ()), ('patternTypeIndex', 'u2', ()), ('gainModeIndex', 'u2', ()), ('egsgat', 'u2', ()), ('g1sgat', 'u2', ()), ('g2sgat', 'u2', ()), ('pcRowNr', 'u2', ()), ('t0Delay', 'u8', ()), ('ticolm', 'u2', ()), ('vrfcds', 'u2', ()), ] if integration_time: self.control_keys.append(('integrationTime', 'u2', ())) if rep_rate: self.control_keys.append( ('bunchStructure/repetitionRate', 'f8', ())) if self.gain_setting: self.control_keys.append(('gain', 'u2', ())) def write_control(self, f): super().write_control(f) ctrl_grp = f'CONTROL/{self.device_id}/' run_grp = f'RUN/{self.device_id}/' for grp in [ctrl_grp, run_grp]: g = f[grp] g['setupr/value'][()] = 32 g['patternTypeIndex/value'][()] = 4 g['gainModeIndex/value'][()] = 0 if self.integration_time: g['integrationTime/value'][()] = 15 if self.rep_rate: g['bunchStructure/repetitionRate/value'][()] = 4.5 if self.gain_setting: g['gain/value'][()] = 0 class AGIPD1MFPGA(DeviceBase): # A sample of some of the available keys. control_keys = [ ('adcLatency', 'u4', ()), ('adcTrigger', 'u4', ()), ('asicCS', 'u4', ()), ('bootId', 'u4', ()), ('commandCounter', 'u4', ()), ('delays', 'u4', (8,)), ('heartbeatInterval', 'i4', ()), ('integrationOffset', 'u4', ()), ('integrationPeriod', 'u4', ()), ('mask', 'u4', ()), ('performanceStatistics/messagingProblems', '|u1', ()), ('performanceStatistics/enable', '|u1', ()), ('performanceStatistics/processingLatency', 'f4', ()), ('performanceStatistics/maxProcessingLatency', 'u4', ()), ('performanceStatistics/numMessages', 'u4', ()), ('performanceStatistics/maxEventLoopLatency', 'u4', ()), ('port', 'i4', ()), ('sleepTime', 'f4', ()), ] class AGIPD500KFPGA(DeviceBase): # A sample of some of the available keys. control_keys = [ ('highVoltage/actual', 'u2', ()), ('highVoltage/target', 'u2', ()), ] def write_control(self, f): super().write_control(f) ctrl_grp = f'CONTROL/{self.device_id}/' run_grp = f'RUN/{self.device_id}/' for grp in [ctrl_grp, run_grp]: f[grp + 'highVoltage/actual/value'][()] = 200 class AGIPD1MPSC(DeviceBase): def __init__( self, device_id, bias_voltage=True, ): super().__init__(device_id) self.bias_voltage = bias_voltage # A sample of some of the available keys. self.control_keys = [ ('applyInProgress', '|u1', ()), ('autoRearm', '|u1', ()), ('channels/U0/status', 'i4', ()), ('channels/U0/switch', 'i4', ()), ('channels/U0/voltage', 'f4', ()), ('channels/U0/superVisionMaxTerminalVoltage', 'f4', ()), ('channels/U0/voltageRampRate', 'f4', ()), ('channels/U0/measurementCurrent', 'f4', ()), ('channels/U0/current', 'f4', ()), ('channels/U0/supervisionMaxCurrent', 'f4', ()), ('channels/U0/currentRiseRate', 'f4', ()), ('channels/U0/currentFallRate', 'f4', ()), ('channels/U0/measurementTemperature', 'i4', ()), ('channels/U0/supervisionBehavior', 'i4', ()), ('channels/U0/tripTimeMaxCurrent', 'i4', ()), ('channels/U0/configMaxSenseVoltage', 'f4', ()), ] if bias_voltage: self.control_keys.append( ('channels/U0/measurementSenseVoltage', 'f8', ())) def write_control(self, f): super().write_control(f) ctrl_grp = f'CONTROL/{self.device_id}/' run_grp = f'RUN/{self.device_id}/' if self.bias_voltage: for grp in [ctrl_grp, run_grp]: g = f[grp] g['channels/U0/measurementSenseVoltage/value'][()] = 300.0 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/base.py0000644000175100001660000002162014757376472021665 0ustar00runnerdockerfrom datetime import datetime, timedelta, timezone import os.path as osp import re import h5py import numpy as np from packaging import version class DeviceBase: # Override these in subclasses control_keys = [] extra_run_values = [] output_channels = () instrument_keys = [] # These are set by write_file ntrains = 400 firsttrain = 10000 chunksize = 200 def __init__(self, device_id, nsamples=None, no_ctrl_data=False): """Create a dummy device :param str device_id: e.g. "SA1_XTD2_XGM/DOOCS/MAIN" :param int ntrains: e.g. 256 :param int nsamples: For INSTRUMENT data only. Default is ntrains. If more, should be a multiple of ntrains. If fewer, samples will be spread evenly across the trains. :param int chunksize: The sample dimension will be padded to a multiple of this. :param bool no_ctrl_data: mock a device that did not save data if set to True. """ self.device_id = device_id self.nsamples = nsamples self.no_ctrl_data = no_ctrl_data def write_control(self, f): """Write the CONTROL and RUN data, and the relevant parts of INDEX""" N = self.ntrains # INDEX i_first = f.create_dataset('INDEX/%s/first' % self.device_id, (N,), 'u8', maxshape=(None,)) i_count = f.create_dataset('INDEX/%s/count' % self.device_id, (N,), 'u8', maxshape=(None,)) i_first[:] = 0 if self.no_ctrl_data else np.arange(N) i_count[:] = 0 if self.no_ctrl_data else 1 # CONTROL & RUN # Creating empty datasets for now. if self.no_ctrl_data: N = 0 for (topic, datatype, dims) in self.control_keys: f.create_dataset('CONTROL/%s/%s/timestamp' % (self.device_id, topic), (N,), 'u8', maxshape=(None,)) f.create_dataset('CONTROL/%s/%s/value' % (self.device_id, topic), (N,)+dims, datatype, maxshape=((None,)+dims)) # RUN is the value at the start of the run f.create_dataset('RUN/%s/%s/timestamp' % (self.device_id, topic), (1,), 'u8', maxshape=(None,)) f.create_dataset('RUN/%s/%s/value' % (self.device_id, topic), (1,)+dims, datatype, maxshape=((None,)+dims)) for (topic, datatype, value) in self.extra_run_values: if isinstance(value, str): datatype = h5py.string_dtype('ascii') f.create_dataset('RUN/%s/%s/timestamp' % (self.device_id, topic), (1,), 'u8', maxshape=(None,)) f.create_dataset('RUN/%s/%s/value' % (self.device_id, topic), (1,) + dims, datatype, data=[value], maxshape=((None,) + dims)) def write_instrument(self, f): """Write the INSTRUMENT data, and the relevant parts of INDEX""" train0 = self.firsttrain if self.nsamples is None: self.nsamples = self.ntrains if self.ntrains == 0: first, count, trainids = [], [], [] elif self.nsamples == 0: first = count = 0 trainids = [] elif self.nsamples < self.ntrains: first = np.linspace(0, self.nsamples, endpoint=False, num=self.ntrains, dtype='u8') count = np.zeros((self.ntrains,), dtype='u8') count[:-1] = first[1:] - first[:-1] if count.sum() < self.nsamples: count[-1] = 1 assert count.sum() == self.nsamples trainids = np.linspace(train0, train0 + self.ntrains, endpoint=False, num=self.nsamples, dtype='u8') elif self.nsamples == self.ntrains: first = np.arange(self.ntrains) count = 1 trainids = np.arange(train0, train0 + self.ntrains) else: # nsamples > ntrains count = self.nsamples // self.ntrains first = np.arange(0, self.nsamples, step=count) trainids = np.repeat(np.arange(train0, train0 + self.ntrains), count) Npad = self.nsamples if Npad % self.chunksize: Npad += + self.chunksize - (Npad % self.chunksize) for channel in self.output_channels: dev_chan = '%s:%s' % (self.device_id, channel) # INDEX i_first = f.create_dataset('INDEX/%s/first' % dev_chan, (self.ntrains,), 'u8', maxshape=(None,)) i_count = f.create_dataset('INDEX/%s/count' % dev_chan, (self.ntrains,), 'u8', maxshape=(None,)) i_first[:] = first i_count[:] = count # INSTRUMENT tid = f.create_dataset('INSTRUMENT/%s/trainId' % dev_chan, (Npad,), 'u8', maxshape=(None,)) if len(trainids) > 0: tid[:self.nsamples] = trainids for (topic, datatype, dims) in self.instrument_keys: f.create_dataset('INSTRUMENT/%s/%s' % (dev_chan, topic), (Npad,) + dims, datatype, maxshape=((None,) + dims)) def datasource_ids(self): if self.control_keys: yield 'CONTROL/' + self.device_id if self.instrument_keys: for channel in self.output_channels: yield 'INSTRUMENT/%s:%s' % (self.device_id, channel) vlen_bytes = h5py.special_dtype(vlen=bytes) def write_metadata(h5file, data_sources, chunksize=16, format_version='0.5'): N = len(data_sources) if N % chunksize: N += chunksize - (N % chunksize) root = [ds.split('/', 1)[0] for ds in data_sources] devices = [ds.split('/', 1)[1] for ds in data_sources] if format_version == '0.5': data_sources_grp = h5file.create_group('METADATA') else: data_sources_grp = h5file.create_group('METADATA/dataSources') sources_ds = data_sources_grp.create_dataset('dataSourceId', (N,), dtype=vlen_bytes, maxshape=(None,)) sources_ds[:len(data_sources)] = data_sources root_ds = data_sources_grp.create_dataset('root', (N,), dtype=vlen_bytes, maxshape=(None,)) root_ds[:len(data_sources)] = root devices_ds = data_sources_grp.create_dataset('deviceId', (N,), dtype=vlen_bytes, maxshape=(None,)) devices_ds[:len(data_sources)] = devices if format_version != '0.5': h5file['METADATA/dataFormatVersion'] = [format_version.encode('ascii')] now = datetime.now(timezone.utc).replace(microsecond=0) updated_time = now + timedelta(minutes=5) h5file['METADATA/creationDate'] = [ now.strftime('%Y%m%dT%H%M%SZ').encode('ascii') ] h5file['METADATA/daqLibrary'] = [b'1.9.0'] h5file['METADATA/karaboFramework'] = [b'2.7.0'] h5file.create_dataset('METADATA/proposalNumber', dtype=np.uint32, data=[700000]) h5file.create_dataset( 'METADATA/runNumber', dtype=np.uint32, data=[int(re.findall(r".*-R([0-9]+)-.*", h5file.filename)[0])], ) h5file['METADATA/runType'] = [b'Test DAQ'] h5file['METADATA/sample'] = [b'No Sample'] # get sequence number fname_pattern = r'^(RAW|CORR)\-R\d+\-.*\-S(\d+).h5$' match = re.match(fname_pattern, osp.basename(h5file.filename)) sequence = int(match[2]) if match is not None else 0 h5file.create_dataset('METADATA/sequenceNumber', dtype=np.uint32, data=[sequence]) h5file['METADATA/updateDate'] = [ updated_time.strftime('%Y%m%dT%H%M%SZ').encode('ascii') ] def write_base_index(f, N, first=10000, chunksize=16, format_version='0.5'): """Make base datasets in the files index 3 dataset are created: flag, timestamp, trainId Real train IDs are much larger (~10^9), so hopefully these won't be mistaken for real ones. """ if N % chunksize: Npad = N + chunksize - (N % chunksize) else: Npad = N if format_version != '0.5': # flag ds = f.create_dataset('INDEX/flag', (Npad,), 'i4', maxshape=(None,)) ds[:N] = np.ones(N) # timestamps ds = f.create_dataset('INDEX/timestamp', (Npad,), 'u8', maxshape=(None,)) # timestamps are stored as a single uint64 with nanoseconds resolution ts = datetime.now(tz=timezone.utc).timestamp() * 10**9 ds[:N] = [ts + i * 10**8 for i in range(N)] if version.parse(format_version) >= version.parse("1.2"): # origin ds = f.create_dataset("INDEX/origin", (Npad,), 'i4', maxshape=(None,)) ds[:N] = -1 * np.ones(N) # trainIds ds = f.create_dataset('INDEX/trainId', (Npad,), 'u8', maxshape=(None,)) ds[:N] = np.arange(first, first + N) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/basler_camera.py0000644000175100001660000000305014757376472023530 0ustar00runnerdocker"""Script that creates a mock-run for the basler camera""" from .base import DeviceBase class BaslerCamera(DeviceBase): """ Basler Camera device Based on example /gpfs/exfel/exp/SPB/201930/p900061/raw/r0055/RAW-R0055-DA01-S00000.h5 """ def __init__(self, device_id, nsamples=None, sensor_size=None): """Create a dummy basler device that inherits from Device Base""" self.sensor_size = sensor_size or (2058, 2456) super(BaslerCamera, self).__init__(device_id, nsamples=nsamples) self.output_channels = ('daqOutput/data',) # Technically, only the part before the / is the output channel. # But there is a structure associated with the part one level after that, # and we don't know what else to call it. self.instrument_keys = [ ('image/bitsPerPixel', 'i4', ()), ('image/dimTypes', 'i4', (2,)), ('image/dims', 'u8', (2,)), ('image/encoding', 'i4', ()), ('image/pixels', 'u2', self.sensor_size), ('image/roiOffsets', 'u8', (2,)), ('image/binning', 'u8', (2,)), ('image/flipX', 'u1', ()), ('image/flipY', 'u1', ()) ] def write_instrument(self, f): super().write_instrument(f) # Add fixed metadata for channel in self.output_channels: image_grp = 'INSTRUMENT/{}:{}/image/'.format(self.device_id, channel) f[image_grp]['bitsPerPixel'][:self.nsamples] = 16 f[image_grp]['dims'][:self.nsamples] = self.sensor_size ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/control_common.py0000644000175100001660000000145614757376472024010 0ustar00runnerdockerinterlock_keys = [ ('interlock/AActCommand', 'u4', ()), ('interlock/AActionState', 'u4', ()), ('interlock/ACndAriOp', 'i2', ()), ('interlock/ACndComOp', 'i2', ()), ('interlock/ACndEnable', 'u1', ()), ('interlock/ACndFiltertime', 'i2', ()), ('interlock/ACndHysteresis', 'u1', ()), ('interlock/ACndSrc1Detail', 'i2', ()), ('interlock/ACndSrc2Detail', 'i2', ()), ('interlock/ACndThreshold', 'u1', ()), ('interlock/ACndValue1', 'u1', ()), ('interlock/ACndValue2', 'u1', ()), ('interlock/AConditionState', 'u4', ()), ('interlockOk', 'u1', ()), ('interlockOn', 'u1', ()), ] triggers_keys = [ ('trigger', 'u4', (1000,)), ] + sum(([ ('triggers/trig%d/enable' % n, 'u1', ()), ('triggers/trig%d/interval' % n, 'f8', ()), ] for n in range(1, 11)), []) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/dctrl.py0000644000175100001660000000353114757376472022064 0ustar00runnerdockerfrom .base import DeviceBase from .control_common import interlock_keys, triggers_keys class DCtrl(DeviceBase): control_keys = [ ('ASafeValue', 'u1', ()), ('busy', 'u1', ()), ('epsilon', 'f4', ()), ('force', 'u1', ()), ('hardwareErrorDescriptor', 'u4', ()), ('hardwareStatusBitField', 'u4', ()), ('maxUpdateFrequency', 'f4', ()), ('pollInterval', 'f4', ()), ('pwmCycleLimit', 'i2', ()), ('pwmDutyCycle', 'f4', ()), ('pwmFrequency', 'f4', ()), ('softDeviceId', 'u4', ()), ('terminal', 'u4', ()), ] + [ # TODO: is there a way to factor these out? ('interlock/AActionState', 'u4', ()), ('interlock/AConditionState', 'u4', ()), ('interlock/a1/AActCommand', 'u4', (1000,)), ('interlock/c1/ACndAriOp', 'i2', ()), ('interlock/c1/ACndComOp', 'i2', ()), ('interlock/c1/ACndEnable', 'u1', ()), ('interlock/c1/ACndFiltertime', 'i2', ()), ('interlock/c1/ACndHysteresis', 'u4', ()), ('interlock/c1/ACndSrc1Detail', 'i2', ()), ('interlock/c1/ACndSrc2Detail', 'i2', ()), ('interlock/c1/ACndThreshold', 'u4', ()), ('interlock/c1/ACndValue1', 'u4', ()), ('interlock/c1/ACndValue2', 'u1', ()), ('interlock/c2/ACndAriOp', 'i2', ()), ('interlock/c2/ACndComOp', 'i2', ()), ('interlock/c2/ACndEnable', 'u1', ()), ('interlock/c2/ACndFiltertime', 'i2', ()), ('interlock/c2/ACndHysteresis', 'u4', ()), ('interlock/c2/ACndSrc1Detail', 'i2', ()), ('interlock/c2/ACndSrc2Detail', 'i2', ()), ('interlock/c2/ACndThreshold', 'u4', ()), ('interlock/c2/ACndValue1', 'u4', ()), ('interlock/c2/ACndValue2', 'u1', ()), ('interlockOk', 'u1', ()), ('interlockOn', 'u1', ()), ] + triggers_keys ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/detectors.py0000644000175100001660000001515214757376472022752 0ustar00runnerdockerimport numpy as np import h5py class DetectorModule: # Overridden in subclasses: image_dims = () detector_data_size = 0 # Set by write_file: ntrains = 100 firsttrain = 10000 chunksize = 32 output_parts = [ 'detector', 'header', 'image', 'trailer', ] def __init__(self, device_id, frames_per_train=64, raw=True, channel_name='xtdf', legacy_name=None): self.device_id = device_id self._frames_per_train = frames_per_train if not raw: # Raw data has an extra dimension, used in AGIPD to separate data # and gain. This dimension is removed by the calibration process. self.image_dims = self.image_dims[1:] self.raw = raw self.channel_name = channel_name self.legacy_name = legacy_name def write_control(self, f): """Write the CONTROL and RUN data, and the relevant parts of INDEX""" pass @property def image_keys(self): if self.raw: return [ ('data', 'u2', self.image_dims), ('length', 'u4', (1,)), ('status', 'u2', (1,)), ] else: return [ ('data', 'f4', self.image_dims), ('mask', 'u4', self.image_dims), ('gain', 'u1', self.image_dims), ('length', 'u4', (1,)), ('status', 'u2', (1,)), ] @property def other_keys(self): return [ ('detector/data', 'u1', (self.detector_data_size,)), ('header/dataId', 'u8', ()), ('header/linkId', 'u8', ()), ('header/magicNumberBegin', 'i1', (8,)), ('header/majorTrainFormatVersion', 'u4', ()), ('header/minorTrainFormatVersion', 'u4', ()), ('header/pulseCount', 'u8', ()), ('header/reserved', 'u1', (16,)), ('trailer/checksum', 'i1', (16,)), ('trailer/magicNumberEnd', 'i1', (8,)), ('trailer/status', 'u8', ()), ] @property def frames_per_train(self): if np.ndim(self._frames_per_train) == 0: return np.full(self.ntrains, self._frames_per_train, np.uint64) return self._frames_per_train def write_instrument(self, f): """Write the INSTRUMENT data, and the relevant parts of INDEX""" trainids = np.arange(self.firsttrain, self.firsttrain + self.ntrains) ntrains_pad = self.ntrains if ntrains_pad % self.chunksize: ntrains_pad += + self.chunksize - (ntrains_pad % self.chunksize) inst_source = f'{self.device_id}:{self.channel_name}' # INDEX for part in self.output_parts: i_first = f.create_dataset(f'INDEX/{inst_source}/{part}/first', (self.ntrains,), 'u8', maxshape=(None,)) i_count = f.create_dataset(f'INDEX/{inst_source}/{part}/count', (self.ntrains,), 'u8', maxshape=(None,)) if part == 'image': # First first is always 0 i_first[1:] = np.cumsum(self.frames_per_train)[:-1] i_count[:] = self.frames_per_train else: i_first[:] = np.arange(self.ntrains) i_count[:] = 1 # INSTRUMENT (image) nframes = self.frames_per_train.sum() tid_index = np.repeat(trainids, self.frames_per_train.astype(np.intp)) pid_index = np.concatenate([ np.arange(0, n, dtype='u8') for n in self.frames_per_train ]) if self.raw: # Raw data have an extra dimension (length 1) and an unlimited max # for the first dimension. ds = f.create_dataset(f'INSTRUMENT/{inst_source}/image/trainId', (nframes, 1), 'u8', maxshape=(None, 1)) ds[:, 0] = tid_index pid = f.create_dataset(f'INSTRUMENT/{inst_source}/image/pulseId', (nframes, 1), 'u8', maxshape=(None, 1)) pid[:, 0] = pid_index cid = f.create_dataset(f'INSTRUMENT/{inst_source}/image/cellId', (nframes, 1), 'u2', maxshape=(None, 1)) cid[:, 0] = pid_index # Cell IDs mirror pulse IDs for now else: # Corrected data drops the extra dimension, and maxshape==shape. f.create_dataset( f'INSTRUMENT/{inst_source}/image/trainId', (nframes,), 'u8', chunks=True, data=tid_index ) f.create_dataset( f'INSTRUMENT/{inst_source}/image/pulseId', (nframes,), 'u8', chunks=True, data=pid_index ) f.create_dataset( # Cell IDs mirror pulse IDs for now f'INSTRUMENT/{inst_source}/image/cellId', (nframes,), 'u2', chunks=True, data=pid_index ) max_len = None if self.raw else nframes for (key, datatype, dims) in self.image_keys: f.create_dataset(f'INSTRUMENT/{inst_source}/image/{key}', (nframes,) + dims, datatype, maxshape=((max_len,) + dims)) # INSTRUMENT (other parts) for part in ['detector', 'header', 'trailer']: ds = f.create_dataset(f'INSTRUMENT/{inst_source}/{part}/trainId', (ntrains_pad,), 'u8', maxshape=(None,)) ds[:self.ntrains] = trainids for (key, datatype, dims) in self.other_keys: f.create_dataset(f'INSTRUMENT/{inst_source}/{key}', (ntrains_pad,) + dims, datatype, maxshape=((None,) + dims)) if self.legacy_name is not None: # The legacy source name for corrected data is the same as for # raw data, which for these detectors always has the xtdf channel. f[f'INDEX/{self.legacy_name}:xtdf'] = h5py.SoftLink( f'/INDEX/{inst_source}') f[f'INSTRUMENT/{self.legacy_name}:xtdf'] = h5py.SoftLink( f'/INSTRUMENT/{inst_source}') def datasource_ids(self): for part in self.output_parts: yield f'INSTRUMENT/{self.device_id}:{self.channel_name}/{part}' if self.legacy_name is not None: for part in self.output_parts: yield f'INSTRUMENT/{self.legacy_name}:xtdf/{part}' class AGIPDModule(DetectorModule): image_dims = (2, 512, 128) detector_data_size = 5408 class LPDModule(DetectorModule): image_dims = (1, 256, 256) detector_data_size = 416 class DSSCModule(DetectorModule): image_dims = (1, 128, 512) detector_data_size = 416 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/gauge.py0000644000175100001660000000203114757376472022036 0ustar00runnerdockerfrom .base import DeviceBase from .control_common import interlock_keys, triggers_keys class Gauge(DeviceBase): control_keys = [ ('AAlarmH', 'f4', ()), ('AAlarmL', 'f4', ()), ('AAverage', 'u1', ()), ('busy', 'u1', ()), ('calibration/expbase', 'f4', ()), ('calibration/formulaType', 'u1', ()), ('calibration/offset', 'f4', ()), ('calibration/rawValue', 'u4', ()), ('calibration/scale', 'f4', ()), ('calibration/terminalFactor', 'f4', ()), ('calibration/terminalOffset', 'f4', ()), ('epsSemiRaw', 'f4', ()), ('epsilon', 'f4', ()), ('force', 'u1', ()), ('hardwareErrorDescriptor', 'u4', ()), ('hardwareStatusBitField', 'u4', ()), ('maxUpdateFrequency', 'f4', ()), ('pollInterval', 'f4', ()), ('relativeEpsilon', 'u1', ()), ('semiRawValue', 'f4', ()), ('softDeviceId', 'u4', ()), ('terminal', 'u4', ()), ('value', 'f4', ()), ] + interlock_keys + triggers_keys ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/gec_camera.py0000644000175100001660000000405414757376472023023 0ustar00runnerdockerfrom .base import DeviceBase class GECCamera(DeviceBase): control_keys = [ ('acquisitionTime', 'f4', ()), ('binningX', 'i4', ()), ('binningY', 'i4', ()), ('coolingLevel', 'i4', ()), ('cropLines', 'i4', ()), ('enableBiasCorrection', 'u1', ()), ('enableBurstMode', 'u1', ()), ('enableCooling', 'u1', ()), ('enableCropMode', 'u1', ()), ('enableExtTrigger', 'u1', ()), ('enableShutter', 'u1', ()), ('enableSync', 'u1', ()), ('exposureTime', 'i4', ()), ('firmwareVersion', 'i4', ()), ('modelId', 'i4', ()), ('numPixelInX', 'i4', ()), ('numPixelInY', 'i4', ()), ('numberOfCoolingLevels', 'i4', ()), ('numberOfMeasurements', 'i4', ()), ('pixelSize', 'f4', ()), ('readOutSpeed', 'i4', ()), ('shutterCloseTime', 'i4', ()), ('shutterOpenTime', 'i4', ()), ('shutterState', 'i4', ()), ('syncHigh', 'u1', ()), ('targetTemperature', 'i4', ()), ('temperatureBack', 'f4', ()), ('temperatureSensor', 'f4', ()), ('triggerTimeOut', 'i4', ()), ('updateInterval', 'i4', ()), ] # Technically, only the part before the / is the output channel. # But there is a structure associated with the part one level after that, # and we don't know what else to call it. output_channels = ('daqOutput/data',) instrument_keys = [ ('image/bitsPerPixel', 'i4', ()), ('image/dimTypes', 'i4', (2,)), ('image/dims', 'u8', (2,)), ('image/encoding', 'i4', ()), ('image/pixels', 'u2', (255, 1024)), ('image/roiOffsets', 'u8', (2,)), ] def write_instrument(self, f): super().write_instrument(f) # Fill in some fixed metadata about the image for channel in self.output_channels: image_grp = 'INSTRUMENT/%s:%s/image/' % (self.device_id, channel) f[image_grp + 'bitsPerPixel'][:self.nsamples] = 16 f[image_grp + 'dims'][:self.nsamples] = [1024, 255] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/imgfel.py0000644000175100001660000001003014757376472022207 0ustar00runnerdockerfrom .base import DeviceBase from .control_common import interlock_keys, triggers_keys class IMGFELCamera(DeviceBase): control_keys = [ ('Logger/file/maxBackupIndex', 'u4', ()), ('Logger/file/maxFileSize', 'u4', ()), ('Logger/file/mode', 'u4', ()), ('acqFrameCount', 'u4', ()), ('autoGain', 'u1', ()), ('bin/X', 'i4', ()), ('bin/Y', 'i4', ()), ('exposureTime', 'f8', ()), ('flip/X', 'u1', ()), ('flip/Y', 'u1', ()), ('frameRate', 'f8', ()), ('frameTransmissionDelay', 'u4', ()), ('gain', 'f8', ()), ('imageDepth', 'i4', ()), ('interPacketDelay', 'u4', ()), ('latencyTime', 'f8', ()), ('nbFrames', 'i4', ()), ('packetSize', 'u4', ()), ('pollingInterval', 'i4', ()), ('roi/Height', 'i4', ()), ('roi/Width', 'i4', ()), ('roi/X', 'i4', ()), ('roi/Y', 'i4', ()), ('rotation', 'i4', ()), ('sensorSize/height', 'i4', ()), ('sensorSize/width', 'i4', ()), ('simulateCamera', 'u1', ()), ('socketBufferSize', 'u4', ()), ('temperature', 'f8', ()), ('writeFile', 'u1', ()), ] # Technically, only the part before the / is the output channel. # But there is a structure associated with the part one level after that, # and we don't know what else to call it. output_channels = ('daqOutput/data',) instrument_keys = [ ('image/bitsPerPixel', 'i4', ()), ('image/dimTypes', 'i4', (2,)), ('image/dims', 'u8', (2,)), ('image/encoding', 'i4', ()), ('image/pixels', 'u2', (1944, 2592)), ('image/roiOffsets', 'u8', (2,)), ] class IMGFELMotor(DeviceBase): control_keys = [ ('ABackEMF', 'u2', ()), ('ACoilResistance', 'u2', ()), ('ADynOffsFactor', 'f4', ()), ('ADynOffsetType', 'u1', ()), ('AFunctionInput1', 'u1', ()), ('AFunctionInput2', 'u1', ()), ('AIntCounter7041', 'u1', ()), ('AMotorFullStep', 'u2', ()), ('AOffsetDynamic', 'f4', ()), ('AOverrun', 'f4', ()), ('aMax', 'i2', ()), ('acceleration', 'f4', ()), ('actualPosition', 'f4', ()), ('backlash', 'f4', ()), ('busy', 'u1', ()), ('calibrateTarget', 'f4', ()), ('checkLimitConsistency', 'u1', ()), ('controllerVoltage', 'f4', ()), ('deadband', 'f4', ()), ('encodeStep', 'f4', ()), ('epsilon', 'f4', ()), ('extEncoderEnabled', 'u1', ()), ('force', 'u1', ()), ('gear', 'f4', ()), ('hardwareErrorDescriptor', 'u4', ()), ('hardwareStatusBitField', 'u4', ()), ('homeNoLimit', 'u1', ()), ('homeUp', 'u1', ()), ('invLogicLim1', 'u1', ()), ('invLogicLim2', 'u1', ()), ('isCCWLimit', 'u1', ()), ('isCWLimit', 'u1', ()), ('isIdleOpenLoop', 'u1', ()), ('isInternalCounter', 'u1', ()), ('isInvertLimits', 'u1', ()), ('isLimitless', 'u1', ()), ('isOnTarget', 'u1', ()), ('isSWLimitHigh', 'u1', ()), ('isSWLimitLow', 'u1', ()), ('isSlave', 'u1', ()), ('limitPosH', 'f4', ()), ('limitPosL', 'f4', ()), ('masterSlaveCorrelation', 'f4', ()), ('maxCurrent', 'u2', ()), ('maxUpdateFrequency', 'f4', ()), ('modus', 'u1', ()), ('motorDriverVoltage', 'f4', ()), ('offset', 'f4', ()), ('pConst', 'f4', ()), ('plcCycleAveraging', 'u1', ()), ('pollInterval', 'f4', ()), ('reducedCurrent', 'u2', ()), ('saveLimitPosition', 'u1', ()), ('softDeviceId', 'u4', ()), ('stepCounterPosition', 'f4', ()), ('stepLength', 'f4', ()), ('syncEncoder', 'u1', ()), ('targetPosition', 'f4', ()), ('targetVelocity', 'i2', ()), ('terminal', 'u4', ()), ('terminalTemperature', 'u1', ()), ('vMax', 'i2', ()), ('vMin', 'i2', ()), ('velocity', 'f4', ()), ] + interlock_keys + triggers_keys ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/jungfrau.py0000644000175100001660000000543014757376472022575 0ustar00runnerdockerfrom .base import DeviceBase class JUNGFRAUModule(DeviceBase): output_channels = ('daqOutput/data',) def __init__(self, device_id, nsamples=None, raw=False): super().__init__(device_id, nsamples) self.raw = raw @property def instrument_keys(self): return [ ('frameNumber', 'u8', (16,)), ('gain', 'u1', (16, 512, 1024)), ('memoryCell', 'u1', (16,)), ('timestamp', 'f8', (16,)), ] + ([ ('adc', 'u2', (16, 512, 1024)), ] if self.raw else [ ('adc', 'f4', (16, 512, 1024)), ('mask', 'u4', (16, 512, 1024)), ]) class JUNGFRAUControl(DeviceBase): control_keys = [ ('acquisitionTime', 'f4', ()), ('angDir', 'i2', (1000,)), ('binSize', 'f4', (1000,)), ('bitDepth', 'i4', ()), ('dataStorage.enable', 'u1', ()), ('dataStorage.fileIndex', 'i4', ()), ('delayAfterTrigger', 'f4', (1000,)), ('detectorHostPort', 'u2', (1000,)), ('detectorHostStopPort', 'u2', (1000,)), ('exposurePeriod', 'f4', ()), ('exposureTime', 'f4', ()), ('exposureTimeout', 'u4', ()), ('exposureTimer', 'u2', ()), ('globalOff', 'f4', (1000,)), ('heartbeatInterval', 'i4', ()), ('lock', 'i2', (1000,)), ('master', 'i2', ()), ('maximumDetectorSize', 'i4', (1000,)), ('moveFlag', 'i2', (1000,)), ('numberOfCycles', 'i8', ()), ('numberOfFrames', 'i8', ()), ('numberOfGates', 'i8', ()), ('online', 'i2', (1000,)), ('performanceStatistics.enable', 'u1', ()), ('performanceStatistics.maxEventLoopLatency', 'u4', ()), ('performanceStatistics.maxProcessingLatency', 'u4', ()), ('performanceStatistics.messagingProblems', 'u1', ()), ('performanceStatistics.numMessages', 'u4', ()), ('performanceStatistics.processingLatency', 'f4', ()), ('pollingInterval', 'u4', ()), ('progress', 'i4', ()), ('rOnline', 'i2', ()), ('rxTcpPort', 'u2', (1000,)), ('rxUdpPort', 'u2', (1000,)), ('rxUdpSocketSize', 'u4', ()), ('storageCellStart', 'i2', ()), ('storageCells', 'i2', ()), ('threaded', 'i2', ()), ('triggerPeriod', 'f4', ()), ('vHighVoltage', 'u4', (1000,)), ('vHighVoltageMax', 'u4', ()), ] class JUNGFRAUMonitor(DeviceBase): control_keys = sum(([ (f'module{n}.adcTemperature', 'f8', ()), (f'module{n}.fpgaTemperature', 'f8', ()), ] for n in range(1, 9)), []) class JUNGFRAUPower(DeviceBase): control_keys = [ ('current', 'f8', ()), ('pollingInterval', 'f8', ()), ('port', 'u2', ()), ('temperature', 'f8', ()), ('voltage', 'f8', ()), ] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/mkfile.py0000644000175100001660000000141214757376472022217 0ustar00runnerdockerimport h5py from .base import write_base_index, write_metadata def write_file(filename, devices, ntrains, firsttrain=10000, chunksize=200, format_version='0.5'): f = h5py.File(filename, 'w') f.create_group('RUN') # Add this, even if it's left empty write_base_index(f, ntrains, first=firsttrain, chunksize=chunksize, format_version=format_version) data_sources = [] for dev in devices: dev.ntrains = ntrains dev.firsttrain = firsttrain dev.chunksize = chunksize dev.write_control(f) dev.write_instrument(f) data_sources.extend(dev.datasource_ids()) write_metadata(f, data_sources, chunksize=chunksize, format_version=format_version) f.close() ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/motor.py0000644000175100001660000001153014757376472022112 0ustar00runnerdockerfrom .base import DeviceBase from .control_common import interlock_keys, triggers_keys class Motor(DeviceBase): control_keys = [ ('AActualVelocity', 'f8', ()), ('AEncoderResolution', 'u4', ()), ('AHomingVelocityOffPlcCam', 'f8', ()), ('AHomingVelocityToPlcCam', 'f8', ()), ('ANCParam', 'u4', ()), ('ANomCurrent', 'u4', ()), ('AOpenloopCurrent', 'u4', ()), ('APeakCurrent', 'u4', ()), ('AProfileAcceleration', 'u4', ()), ('AQuickStopDecceleration', 'u4', ()), ('AStandByCurrent', 'u4', ()), ('AStepperResolution', 'u4', ()), ('Acontrolword', 'u2', ()), ('actualPosition', 'f8', ()), ('axisBacklash', 'f8', ()), ('busy', 'u1', ()), ('calibrateTarget', 'f8', ()), ('enableSWLimitHigh', 'u1', ()), ('enableSWLimitLow', 'u1', ()), ('encoderPosition', 'f8', ()), ('epsilon', 'f4', ()), ('epsilonActualPosition', 'f8', ()), ('epsilonActualVelocity', 'f8', ()), ('force', 'u1', ()), ('hardwareErrorDescriptor', 'u4', ()), ('hardwareStatusBitField', 'u4', ()), ('isCCWLimit', 'u1', ()), ('isCWLimit', 'u1', ()), ('isInterlockLimitHigh', 'u1', ()), ('isInterlockLimitLow', 'u1', ()), ('isOnTarget', 'u1', ()), ('isSWLimitHigh', 'u1', ()), ('isSWLimitLow', 'u1', ()), ('isSlave', 'u1', ()), ('maxUpdateFrequency', 'f4', ()), ('mc2/aaxisacc', 'f8', ()), ('mc2/aaxiscalibrationvelocitybackward', 'f8', ()), ('mc2/aaxiscalibrationvelocityforward', 'f8', ()), ('mc2/aaxiscycletime', 'f8', ()), ('mc2/aaxisdec', 'f8', ()), ('mc2/aaxisdelaytimeveloposition', 'f8', ()), ('mc2/aaxisenableposcorrection', 'u1', ()), ('mc2/aaxisenbacklashcompensation', 'u1', ()), ('mc2/aaxisencoderdirectioninverse', 'u1', ()), ('mc2/aaxisencodermask', 'u4', ()), ('mc2/aaxisencodermodulovalue', 'f8', ()), ('mc2/aaxisencoderoffset', 'f8', ()), ('mc2/aaxisencoderscalingfactor', 'f8', ()), ('mc2/aaxisendatapersistence', 'u1', ()), ('mc2/aaxisenintargettimeout', 'u1', ()), ('mc2/aaxisenloopingdistance', 'u1', ()), ('mc2/aaxisenpositionlagmonitoring', 'u1', ()), ('mc2/aaxisenpositionrangemonitoring', 'u1', ()), ('mc2/aaxisentargetpositionmonitoring', 'u1', ()), ('mc2/aaxisfastacc', 'f8', ()), ('mc2/aaxisfastjerk', 'f8', ()), ('mc2/aaxisfaststopsignaltype', 'u4', ()), ('mc2/aaxisid', 'f8', ()), ('mc2/aaxisintargettimeout', 'f8', ()), ('mc2/aaxisjerk', 'f8', ()), ('mc2/aaxisjogincrementbackward', 'f8', ()), ('mc2/aaxisjogincrementforward', 'f8', ()), ('mc2/aaxisloopingdistance', 'f8', ()), ('mc2/aaxismanualvelocityfast', 'f8', ()), ('mc2/aaxismanualvelocityslow', 'f8', ()), ('mc2/aaxismaxposlagfiltertime', 'f8', ()), ('mc2/aaxismaxposlagvalue', 'f8', ()), ('mc2/aaxismaxvelocity', 'f8', ()), ('mc2/aaxismodulotolerancewindow', 'f8', ()), ('mc2/aaxismotionmonitoringtime', 'f8', ()), ('mc2/aaxismotionmonitoringwindow', 'f8', ()), ('mc2/aaxismotordirectioninverse', 'u1', ()), ('mc2/aaxisoverridetype', 'f8', ()), ('mc2/aaxisposcorrectionfiltertime', 'f8', ()), ('mc2/aaxispositionrangewindow', 'f8', ()), ('mc2/aaxisrapidtraversevelocity', 'f8', ()), ('mc2/aaxisrefveloonrefoutput', 'f8', ()), ('mc2/aaxistargetpositionmonitoringtime', 'f8', ()), ('mc2/aaxistargetpositionwindow', 'f8', ()), ('mc2/aaxisunitinterpretation', 'f8', ()), ('mc2/acommandedvelocity', 'f8', ()), ('mc2/aencoderaxisoffset', 'f8', ()), ('mc2/aencoderaxisscalingfactor', 'f8', ()), ('mc2/aencoderreferencemode', 'u1', ()), ('mc2/ahomingvelocitoffplccam', 'f8', ()), ('mc2/ahomingvelocittowardsplccam', 'f8', ()), ('mc2/ainvertdircalibrationcamsearch', 'u1', ()), ('mc2/ainvertdirsyncpulssearch', 'u1', ()), ('mc2/amodulotargetposition', 'f8', ()), ('mc2/amovedirection', 'i4', ()), ('mc2/ancsvbcycletime', 'f8', ()), ('mc2/axisenmotionmonitoring', 'u1', ()), ('mc2/axisfastdec', 'f8', ()), ('mc2/extendedStateWord', 'u4', ()), ('mc2/ncsafcycletime', 'f8', ()), ('mc2ContinuousMotion', 'u1', ()), ('mc2DiscreteMotion', 'u1', ()), ('mc2ErrorStop', 'u1', ()), ('pollInterval', 'f4', ()), ('softDeviceId', 'u4', ()), ('specificError', 'u4', ()), ('stepSize', 'f8', ()), ('swLimitHigh', 'f8', ()), ('swLimitLow', 'f8', ()), ('targetPosition', 'f8', ()), ('targetVelocity', 'f8', ()), ('terminal', 'u4', ()), ] + interlock_keys + triggers_keys ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/mpod.py0000644000175100001660000001106614757376472021715 0ustar00runnerdockerfrom .base import DeviceBase class MPOD(DeviceBase): control_keys = [ ('Logger/file/maxBackupIndex', 'u4', ()), ('Logger/file/maxFileSize', 'u4', ()), ('Logger/file/mode', 'u4', ()), ('autoRearm', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsEventActive', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsFineAdjustment', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsGood', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsHardwareLimitVoltageGood', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsInputError', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsKillEnable', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsLiveInsertion', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsNoRamp', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsNoSumError', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsSafetyLoopGood', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsSupplyGood', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsTemperatureGood', 'u1', ()), ('boards/board_0/boardStatus/bitModuleNeedService', 'u1', ()), ('boards/board_0/boardStatus/bitModuleReserved1', 'u1', ()), ('boards/board_0/boardStatus/bitModuleReserved2', 'u1', ()), ('boards/board_0/boardStatus/bitModuleReserved3', 'u1', ()), ('boards/board_0/status', 'i4', ()), ('crateNode/crate', 'u4', ()), ('crateNode/crateStatus/bitBusReset', 'u1', ()), ('crateNode/crateStatus/bitFanTrayFailure', 'u1', ()), ('crateNode/crateStatus/bitInputFailure', 'u1', ()), ('crateNode/crateStatus/bitLocalControlOnly', 'u1', ()), ('crateNode/crateStatus/bitMainInError', 'u1', ()), ('crateNode/crateStatus/bitMainInhibit', 'u1', ()), ('crateNode/crateStatus/bitMainOn', 'u1', ()), ('crateNode/crateStatus/bitOutputFailure', 'u1', ()), ('crateNode/crateStatus/bitPlugAndPlayIncompatible', 'u1', ()), ('crateNode/crateStatus/bitSensorFailure', 'u1', ()), ('crateNode/crateStatus/bitSupplyDerating', 'u1', ()), ('crateNode/crateStatus/bitSupplyDerating2', 'u1', ()), ('crateNode/crateStatus/bitSupplyFailure', 'u1', ()), ('crateNode/crateStatus/bitSupplyFailure2', 'u1', ()), ('crateNode/crateStatus/bitVmeSysfail', 'u1', ()), ('crateNode/expandChannelStatus', 'u1', ()), ('crateNode/fanNominalSpeed', 'i4', ()), ('crateNode/groupsSwitchIseg', 'i4', ()), ('crateNode/indexes', 'u4', ()), ('crateNode/output', 'u4', ()), ('crateNode/outputNumber', 'i4', ()), ('crateNode/pollPeriod', 'i4', ()), ('crateNode/psOperatingTime', 'i4', ()), ('crateNode/secureOperation', 'u1', ()), ('crateNode/settlePeriod', 'i4', ()), ('crateNode/snmpPort', 'i4', ()), ('crateNode/snmpThreshold', 'i4', ()), ('crateNode/sysHardwareReset', 'i4', ()), ('crateNode/sysMainSwitch', 'i4', ()), ('crateNode/sysStatus', 'u4', ()), ('crateNode/system', 'u4', ()), ] + sum(([ ('channels/U%d/configMaxCurrent' % n, 'f4', ()), ('channels/U%d/configMaxSenseVoltage' % n, 'f4', ()), ('channels/U%d/configMaxTemperature' % n, 'i4', ()), ('channels/U%d/configMaxTerminalVoltage' % n, 'f4', ()), ('channels/U%d/current' % n, 'f4', ()), ('channels/U%d/currentFallRate' % n, 'f4', ()), ('channels/U%d/currentRiseRate' % n, 'f4', ()), ('channels/U%d/groupid' % n, 'i4', ()), ('channels/U%d/index' % n, 'i4', ()), ('channels/U%d/measurementCurrent' % n, 'f4', ()), ('channels/U%d/measurementSenseVoltage' % n, 'f4', ()), ('channels/U%d/measurementTemperature' % n, 'i4', ()), ('channels/U%d/measurementTerminalVoltage' % n, 'f4', ()), ('channels/U%d/status' % n, 'i4', ()), ('channels/U%d/supervisionBehavior' % n, 'i4', ()), ('channels/U%d/supervisionMaxCurrent' % n, 'f4', ()), ('channels/U%d/supervisionMaxPower' % n, 'f4', ()), ('channels/U%d/supervisionMaxSenseVoltage' % n, 'f4', ()), ('channels/U%d/supervisionMaxTemperature' % n, 'i4', ()), ('channels/U%d/supervisionMaxTerminalVoltage' % n, 'f4', ()), ('channels/U%d/supervisionMinSenseVoltage' % n, 'f4', ()), ('channels/U%d/switch' % n, 'i4', ()), ('channels/U%d/tripTimeMaxCurrent' % n, 'i4', ()), ('channels/U%d/voltage' % n, 'f4', ()), ('channels/U%d/voltageRampRate' % n, 'f4', ()), ] for n in range(8)), []) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/proc.py0000644000175100001660000000154214757376472021717 0ustar00runnerdocker """Script that creates mock-data for virtual processed devices. These are virtual devices that do not actually exist in Karabo for real, but are the result of processing raw data into a scientifically more useful representation or as a form of data reduction. """ import numpy as np from .base import DeviceBase class ReconstructedDLD6(DeviceBase): """ Reconstructed DLD6 data from ADQ digitizer traces. Based on example /gpfs/exfel/exp/SQS/202101/p002448/proc/r0210/CORR-R0210-REMI01-S00000.h5 """ hits_dt = np.dtype([ ('x', 'f8'), ('y', 'f8'), ('t', 'f8'), ('m', 'i4') ]) signals_dt = np.dtype([ (key, 'f8') for key in ['u1', 'u2', 'v1', 'v2', 'w1', 'w2', 'mcp'] ]) output_channels = ('output/rec',) instrument_keys = [('signals', signals_dt, (50,)), ('hits', hits_dt, (50,))] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/sidemic_camera.py0000644000175100001660000000202614757376472023677 0ustar00runnerdockerfrom .base import DeviceBase class SidemicCamera(DeviceBase): # Based on example in /gpfs/exfel/d/raw/SPB/201701/p002012/r0309/RAW-R0309-DA01-S00000.h5 # Technically, only the part before the / is the output channel. # But there is a structure associated with the part one level after that, # and we don't know what else to call it. output_channels = ('daqOutput/data',) instrument_keys = [ ('image/bitsPerPixel', 'i4', ()), ('image/dimTypes', 'i4', (2,)), ('image/dims', 'u8', (2,)), ('image/encoding', 'i4', ()), ('image/pixels', 'u2', (2058, 2456)), ('image/roiOffsets', 'u8', (2,)), ] def write_instrument(self, f): super().write_instrument(f) # Fill in some fixed metadata about the image for channel in self.output_channels: image_grp = 'INSTRUMENT/%s:%s/image/' % (self.device_id, channel) f[image_grp + 'bitsPerPixel'][:self.nsamples] = 16 f[image_grp + 'dims'][:self.nsamples] = [1000, 1000] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/tsens.py0000644000175100001660000000204614757376472022110 0ustar00runnerdockerfrom .base import DeviceBase from .control_common import interlock_keys, triggers_keys class TemperatureSensor(DeviceBase): control_keys = [ ('AAlarmH', 'f4', ()), ('AAlarmL', 'f4', ()), ('AAverage', 'u1', ()), ('busy', 'u1', ()), ('calibration/expbase', 'f4', ()), ('calibration/formulaType', 'u1', ()), ('calibration/offset', 'f4', ()), ('calibration/rawValue', 'u4', ()), ('calibration/scale', 'f4', ()), ('calibration/terminalFactor', 'f4', ()), ('calibration/terminalOffset', 'f4', ()), ('epsSemiRaw', 'f4', ()), ('epsilon', 'f4', ()), ('force', 'u1', ()), ('hardwareErrorDescriptor', 'u4', ()), ('hardwareStatusBitField', 'u4', ()), ('maxUpdateFrequency', 'f4', ()), ('pollInterval', 'f4', ()), ('relativeEpsilon', 'u1', ()), ('semiRawValue', 'f4', ()), ('softDeviceId', 'u4', ()), ('terminal', 'u4', ()), ('value', 'f4', ()), ] + interlock_keys + triggers_keys ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/uvlamp.py0000644000175100001660000000120514757376472022254 0ustar00runnerdockerfrom .base import DeviceBase from .control_common import interlock_keys, triggers_keys class UVLamp(DeviceBase): control_keys = [ ('ASafeValue', 'u1', ()), ('busy', 'u1', ()), ('epsilon', 'f4', ()), ('force', 'u1', ()), ('hardwareErrorDescriptor', 'u4', ()), ('hardwareStatusBitField', 'u4', ()), ('maxUpdateFrequency', 'f4', ()), ('pollInterval', 'f4', ()), ('pwmCycleLimit', 'i2', ()), ('pwmDutyCycle', 'f4', ()), ('pwmFrequency', 'f4', ()), ('softDeviceId', 'u4', ()), ('terminal', 'u4', ()), ] + interlock_keys + triggers_keys ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/mockdata/xgm.py0000644000175100001660000000652014757376472021550 0ustar00runnerdockerimport numpy as np from .base import DeviceBase class XGM(DeviceBase): control_keys = [ ('beamPosition/ixPos', 'f4', ()), ('beamPosition/iyPos', 'f4', ()), ('current/bottom/output', 'f4', ()), ('current/bottom/rangeCode', 'i4', ()), ('current/left/output', 'f4', ()), ('current/left/rangeCode', 'i4', ()), ('current/right/output', 'f4', ()), ('current/right/rangeCode', 'i4', ()), ('current/top/output', 'f4', ()), ('current/top/rangeCode', 'i4', ()), ('gasDosing/measuredPressure', 'f4', ()), ('gasDosing/pressureSetPoint', 'f4', ()), ('gasSupply/gasTypeId', 'i4', ()), ('gasSupply/gsdCompatId', 'i4', ()), ('pollingInterval', 'i4', ()), ('pressure/dcr', 'f4', ()), ('pressure/gasType', 'i4', ()), ('pressure/pressure1', 'f4', ()), ('pressure/pressureFiltered', 'f4', ()), ('pressure/rd', 'f4', ()), ('pressure/rsp', 'f4', ()), ('pulseEnergy/conversion', 'f8', ()), ('pulseEnergy/crossUsed', 'f4', ()), ('pulseEnergy/gammaUsed', 'f4', ()), ('pulseEnergy/gmdError', 'i4', ()), ('pulseEnergy/nummberOfBrunches', 'f4', ()), ('pulseEnergy/photonFlux', 'f4', ()), ('pulseEnergy/pressure', 'f4', ()), ('pulseEnergy/temperature', 'f4', ()), ('pulseEnergy/usedGasType', 'i4', ()), ('pulseEnergy/wavelengthUsed', 'f4', ()), ('signalAdaption/dig', 'i4', ()), ] extra_run_values = [ ('classId', None, 'DoocsXGM'), ] # Technically, only the part before the / is the output channel. # But there is a structure associated with the part one level after that, # and we don't know what else to call it. output_channels = ('output/data',) instrument_keys = [ ('intensityTD', 'f4', (1000,)), ('intensityAUXTD', 'f4', (1000,)), ('intensitySigma/x_data', 'f4', (1000,)), ('intensitySigma/y_data', 'f4', (1000,)), ('xTD', 'f4', (1000,)), ('yTD', 'f4', (1000,)), ] def write_instrument(self, f): super().write_instrument(f) # Annotate intensityTD with some units to test retrieving them # Karabo stores ASCII strings, assigning bytes is a shortcut to mimic that ds = f[f'INSTRUMENT/{self.device_id}:output/data/intensityTD'] ds.attrs['metricPrefixEnum']= np.array([14], dtype=np.int32) ds.attrs['metricPrefixName'] = b'micro' ds.attrs['metricPrefixSymbol'] = b'u' ds.attrs['unitEnum'] = np.array([15], dtype=np.int32) ds.attrs['unitName'] = b'joule' ds.attrs['unitSymbol'] = b'J' # Also annotate a CONTROL key, where attributes are split across # the parent key group and the value dataset. # (The timestamp dataset has its own, but distinct attributes) # Specific examples taken from p5696, r32 grp = f[f'CONTROL/{self.device_id}/beamPosition/ixPos'] grp.attrs['alias'] = b'IX.POS' grp.attrs['description'] = b'Calculated X position [mm]' grp.attrs['daqPolicy'] = np.array([-1], dtype=np.int32) # daqPolicy is intentionally different, the correct schema value # is -1 as above! ds = grp['value'] ds.attrs['alias'] = b'IX.POS' ds.attrs['daqPolicy'] = np.array([1], dtype=np.int32) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_aliases.py0000644000175100001660000003055214757376472021654 0ustar00runnerdocker from itertools import islice import pytest import numpy as np from extra_data import ( H5File, KeyData, by_index, by_id, AliasError, SourceNameError, PropertyNameError ) def test_with_aliases(mock_sa3_control_data, mock_sa3_control_aliases): run_without_aliases = H5File(mock_sa3_control_data) run = run_without_aliases.with_aliases(mock_sa3_control_aliases) def assert_equal_keydata(kd1, kd2): assert isinstance(kd1, KeyData) assert isinstance(kd2, KeyData) assert kd1.source == kd2.source assert kd1.key == kd2.key assert kd1.train_ids == kd2.train_ids # Test whether source alias yields identical SourceData. assert run.alias['sa3-xgm'] is run['SA3_XTD10_XGM/XGM/DOOCS'] # Test alternative capitalisation and _ instead of - assert run.alias['SA3_XGM'] is run['SA3_XTD10_XGM/XGM/DOOCS'] # Test __contains__() assert "sa3-xgm" in run.alias assert not "sa42-xgm" in run.alias with pytest.raises(TypeError): 42 in run.alias # Test whether source alias plus literal key yields equal KeyData. assert_equal_keydata( run.alias['sa3-xgm', 'pulseEnergy.wavelengthUsed'], run['SA3_XTD10_XGM/XGM/DOOCS', 'pulseEnergy.wavelengthUsed']) # Test whether key alias yields equal KeyData. assert_equal_keydata( run.alias['hv'], run['SA3_XTD10_XGM/XGM/DOOCS', 'pulseEnergy.wavelengthUsed']) # Test undefined aliases. with pytest.raises(AliasError): run.alias['foo'] run.alias['foo', 'bar'] # Test using a literal key with a key alias. with pytest.raises(ValueError): run.alias['hv', 'pressure'] # Test using an existing source alias for a non-existing source. with pytest.raises(SourceNameError): run.alias['bogus-source'] # Test using an existing key alias for a non-existing key. with pytest.raises(PropertyNameError): run.alias['bogus-key'] # Test re-applying the same aliases. run2 = run.with_aliases(mock_sa3_control_aliases) assert run._aliases == run2._aliases # Test adding additional aliases. run3 = run.with_aliases({'foo': 'bar'}) assert set(run._aliases.keys()) < set(run3._aliases.keys()) assert 'foo' in run3._aliases # Test adding conflicting aliases with pytest.raises(ValueError): run.with_aliases({'sa3-xgm': 'x'}) # Test dropping aliases again. run4 = run.drop_aliases() assert not run4._aliases # Smoke tests for __str__() and __repr__() assert "Loaded aliases" in repr(run.alias) assert "No aliases" in repr(run_without_aliases.alias) str(run.alias) def test_alias_clash(mock_sa3_control_data, mock_sa3_control_aliases): run_without_aliases = H5File(mock_sa3_control_data) # The aliases include 'mcp-adc' - test with an equivalent name with pytest.raises(ValueError, match='conflicting alias'): mock_sa3_control_aliases.update({'MCP_ADC': 'SA3_XTD10_MCP/ADC/2'}) run_without_aliases.with_aliases(mock_sa3_control_aliases) def test_json_alias_file(mock_sa3_control_data, mock_sa3_control_aliases, tmp_path): aliases_path = tmp_path / 'aliases.json' aliases_path.write_text(''' { "sa3-xgm": "SA3_XTD10_XGM/XGM/DOOCS", "SA3_XTD10_XGM/XGM/DOOCS": { "hv": "pulseEnergy.wavelengthUsed", "beam-x": "beamPosition.ixPos", "beam-y": "beamPosition.iyPos" }, "imgfel-frames": ["SA3_XTD10_IMGFEL/CAM/BEAMVIEW:daqOutput", "data.image.pixels"], "imgfel-frames2": ["SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput", "data.image.pixels"], "imgfel-screen-pos": ["SA3_XTD10_IMGFEL/MOTOR/SCREEN", "actualPosition"], "imgfel-filter-pos": ["SA3_XTD10_IMGFEL/MOTOR/FILTER", "actualPosition"], "mcp-adc": "SA3_XTD10_MCP/ADC/1", "mcp-mpod": "SA3_XTD10_MCP/MCPS/MPOD", "mcp-voltage": ["SA3_XTD10_MCP/MCPS/MPOD", "channels.U3.voltage"], "mcp-trace": ["SA3_XTD10_MCP/ADC/1:channel_5.output", "data.rawData"], "bogus-source": "SA4_XTD20_XGM/XGM/DOOCS", "bogus-key": ["SA3_XTD10_XGM/XGM/DOOCS", "foo"] } ''') run = H5File(mock_sa3_control_data).with_aliases(aliases_path) assert run._aliases == mock_sa3_control_aliases def test_yaml_alias_file(mock_sa3_control_data, mock_sa3_control_aliases, tmp_path): aliases_path = tmp_path / 'aliases.yaml' aliases_path.write_text(''' sa3-xgm: SA3_XTD10_XGM/XGM/DOOCS SA3_XTD10_XGM/XGM/DOOCS: hv: pulseEnergy.wavelengthUsed beam-x: beamPosition.ixPos beam-y: beamPosition.iyPos imgfel-frames: [SA3_XTD10_IMGFEL/CAM/BEAMVIEW:daqOutput, data.image.pixels] imgfel-frames2: [SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput, data.image.pixels] imgfel-screen-pos: [SA3_XTD10_IMGFEL/MOTOR/SCREEN, actualPosition] imgfel-filter-pos: [SA3_XTD10_IMGFEL/MOTOR/FILTER, actualPosition] # Will be normalised to mcp-adc MCP_ADC: SA3_XTD10_MCP/ADC/1 mcp-mpod: SA3_XTD10_MCP/MCPS/MPOD mcp-voltage: [SA3_XTD10_MCP/MCPS/MPOD, channels.U3.voltage] mcp-trace: [SA3_XTD10_MCP/ADC/1:channel_5.output, data.rawData] bogus-source: SA4_XTD20_XGM/XGM/DOOCS bogus-key: [SA3_XTD10_XGM/XGM/DOOCS, foo] ''') run = H5File(mock_sa3_control_data).with_aliases(aliases_path) assert run._aliases == mock_sa3_control_aliases def test_toml_alias_file(mock_sa3_control_data, mock_sa3_control_aliases, tmp_path): aliases_path = tmp_path / 'aliases.toml' aliases_path.write_text(''' sa3-xgm = "SA3_XTD10_XGM/XGM/DOOCS" imgfel-frames = ["SA3_XTD10_IMGFEL/CAM/BEAMVIEW:daqOutput", "data.image.pixels"] imgfel-frames2 = ["SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput", "data.image.pixels"] imgfel-screen-pos = ["SA3_XTD10_IMGFEL/MOTOR/SCREEN", "actualPosition"] imgfel-filter-pos = ["SA3_XTD10_IMGFEL/MOTOR/FILTER", "actualPosition"] mcp-adc = "SA3_XTD10_MCP/ADC/1" mcp-mpod = "SA3_XTD10_MCP/MCPS/MPOD" mcp-voltage = ["SA3_XTD10_MCP/MCPS/MPOD", "channels.U3.voltage"] mcp-trace = ["SA3_XTD10_MCP/ADC/1:channel_5.output", "data.rawData"] bogus-source = "SA4_XTD20_XGM/XGM/DOOCS" bogus-key = ["SA3_XTD10_XGM/XGM/DOOCS", "foo"] ["SA3_XTD10_XGM/XGM/DOOCS"] hv = "pulseEnergy.wavelengthUsed" beam-x = "beamPosition.ixPos" beam-y = "beamPosition.iyPos" ''') run = H5File(mock_sa3_control_data).with_aliases(aliases_path) assert run._aliases == mock_sa3_control_aliases def test_only_aliases(mock_sa3_control_data, mock_sa3_control_aliases): run = H5File(mock_sa3_control_data).with_aliases(mock_sa3_control_aliases) subrun = H5File(mock_sa3_control_data).only_aliases(mock_sa3_control_aliases) # Assume that aliases work when the _aliases property is equal. assert run._aliases == subrun._aliases # Test whether only the sources used in aliases are present. assert subrun.all_sources == { 'SA3_XTD10_XGM/XGM/DOOCS', 'SA3_XTD10_IMGFEL/CAM/BEAMVIEW:daqOutput', 'SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput', 'SA3_XTD10_IMGFEL/MOTOR/SCREEN', 'SA3_XTD10_IMGFEL/MOTOR/FILTER', 'SA3_XTD10_MCP/ADC/1', 'SA3_XTD10_MCP/MCPS/MPOD', 'SA3_XTD10_MCP/ADC/1:channel_5.output', } # Test whether all keys are present for an aliased source. assert subrun['SA3_XTD10_XGM/XGM/DOOCS'].keys() == run['SA3_XTD10_XGM/XGM/DOOCS'].keys() # Test whether all keys are present for an aliased source, even if # there are key aliases for it as well. assert subrun['SA3_XTD10_MCP/MCPS/MPOD'].keys() == run['SA3_XTD10_MCP/MCPS/MPOD'].keys() # Test whether only aliased keys are present for unaliased sources. assert subrun['SA3_XTD10_IMGFEL/MOTOR/SCREEN'].keys() == {'actualPosition.value'} # Test strict selection. with pytest.raises(ValueError): H5File(mock_sa3_control_data).only_aliases( mock_sa3_control_aliases, strict=True) # Remove bogus aliases and test strict selection again. strict_aliases = mock_sa3_control_aliases.copy() del strict_aliases['bogus-source'] del strict_aliases['bogus-key'] H5File(mock_sa3_control_data).only_aliases(strict_aliases, strict=True) # Prepare a run with less trains for a single source # (SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput) by removing all sources # without any trains. run = H5File(mock_sa3_control_data) \ .deselect([('SA3_XTD10_MCP/ADC/1:*', '*'), ('SA3_XTD10_IMGFEL/CAM/BEAMVIEW:*', '*')]) del strict_aliases['mcp-trace'] del strict_aliases['imgfel-frames'] # Without strict alias selection and a bogus alias. subrun = run.only_aliases(mock_sa3_control_aliases, require_all=True, strict=False) np.testing.assert_array_equal(subrun.train_ids, run.train_ids[1::2]) # With strict alias selection. subrun = run.only_aliases(strict_aliases, require_all=True, strict=True) np.testing.assert_array_equal(subrun.train_ids, run.train_ids[1::2]) def test_preserve_aliases(mock_sa3_control_data, mock_sa3_control_aliases): run = H5File(mock_sa3_control_data).with_aliases(mock_sa3_control_aliases) # Test whether selection operations preserve aliases. assert run.select_trains(by_index[:5])._aliases == run._aliases assert run.select_trains(by_id[run.train_ids[:5]])._aliases == run._aliases assert run.select('*')._aliases == run._aliases assert run.deselect('*XGM*')._aliases == run._aliases assert all([subrun._aliases == run._aliases for subrun in run.split_trains(parts=5)]) def test_aliases_union(mock_sa3_control_data, mock_sa3_control_aliases): run = H5File(mock_sa3_control_data).with_aliases(mock_sa3_control_aliases) # Split the aliases into two halves and test the union. run1 = run.with_aliases(dict(islice(mock_sa3_control_aliases.items(), 0, None, 2))) run2 = run.with_aliases(dict(islice(mock_sa3_control_aliases.items(), 1, None, 2))) assert run1.union(run2)._aliases == mock_sa3_control_aliases # Split the run into two. even_run = run.select_trains(by_id[run.train_ids[0::2]]) odd_run = run.select_trains(by_id[run.train_ids[1::2]]) # Test overlapping aliases with no conflict. even_run.union(odd_run) # Test conflicting aliases. conflicting_aliases = mock_sa3_control_aliases.copy() conflicting_aliases['hv'] = ('SA3_XTD10_XGM/XGM/DOOCS', 'pressure.pressure1') with pytest.raises(ValueError): even_run.union(odd_run.with_aliases(conflicting_aliases)) def test_alias_select(mock_sa3_control_data, mock_sa3_control_aliases): run = H5File(mock_sa3_control_data).with_aliases(mock_sa3_control_aliases) # Only source alias. subrun = run.alias.select('sa3-xgm') assert subrun.all_sources == {'SA3_XTD10_XGM/XGM/DOOCS'} assert subrun.alias['sa3-xgm'].keys() == run.alias['sa3-xgm'].keys() # Source alias and key glob. subrun = run.alias.select('sa3-xgm', 'pressure.pressure*.value') assert subrun.all_sources == {'SA3_XTD10_XGM/XGM/DOOCS'} assert subrun.alias['sa3-xgm'].keys() == { 'pressure.pressure1.value', 'pressure.pressureFiltered.value'} # Iterable of aliases and/or with key globs. subrun = run.alias.select([('sa3-xgm', 'pressure.pressure*.value'), 'beam-x', 'mcp-voltage']) assert subrun.all_sources == {'SA3_XTD10_XGM/XGM/DOOCS', 'SA3_XTD10_MCP/MCPS/MPOD'} assert subrun.alias['sa3-xgm'].keys() == { 'pressure.pressure1.value', 'pressure.pressureFiltered.value', 'beamPosition.ixPos.value'} assert subrun.alias['mcp-mpod'].keys() == {'channels.U3.voltage.value'} # Dictionary subrun = run.alias.select({'sa3-xgm': None, 'mcp-mpod': {'channels.U1.voltage'}}) assert subrun.all_sources == {'SA3_XTD10_XGM/XGM/DOOCS', 'SA3_XTD10_MCP/MCPS/MPOD'} assert subrun.alias['sa3-xgm'].keys() == run.alias['sa3-xgm'].keys() assert subrun.alias['mcp-mpod'].keys() == {'channels.U1.voltage.value'} def test_alias_deselect(mock_sa3_control_data, mock_sa3_control_aliases): run = H5File(mock_sa3_control_data).with_aliases(mock_sa3_control_aliases) # De-select via alias. subrun = run.alias.deselect([ ('sa3-xgm', 'pressure.*'), ('sa3-xgm', 'current.*'), ('sa3-xgm', 'gasDosing.*'), ('sa3-xgm', 'gasSupply.*'), ('sa3-xgm', 'pressure.*'), ('sa3-xgm', 'pulseEnergy.*'), ('sa3-xgm', 'signalAdaption.*') ]) assert subrun.all_sources == run.all_sources assert subrun.alias['sa3-xgm'].keys() == { 'beamPosition.ixPos.value', 'beamPosition.ixPos.timestamp', 'beamPosition.iyPos.value', 'beamPosition.iyPos.timestamp', 'pollingInterval.value', 'pollingInterval.timestamp'} ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_bad_trains.py0000644000175100001660000002510614757376472022340 0ustar00runnerdockerimport os.path as osp from tempfile import TemporaryDirectory import h5py import numpy as np import pytest from testpath import assert_isfile from extra_data import H5File from extra_data.components import AGIPD1M from extra_data.exceptions import TrainIDError from extra_data.file_access import FileAccess from . import make_examples @pytest.fixture(scope='module') def agipd_file_tid_very_high(): with TemporaryDirectory() as td: path = osp.join(td, 'CORR-R9999-AGIPD07-S00000.h5') make_examples.make_agipd_example_file(path, format_version='0.5') with h5py.File(path, 'r+') as f: # Initial train IDs are np.arange(10000, 10250) f['INDEX/trainId'][10] = 10400 yield path @pytest.fixture(scope='module') def agipd_file_tid_high(): with TemporaryDirectory() as td: path = osp.join(td, 'CORR-R9999-AGIPD07-S00000.h5') make_examples.make_agipd_file(path, format_version='0.5') with h5py.File(path, 'r+') as f: # Initial train IDs are np.arange(10000, 10486), this will appear 2x f['INDEX/trainId'][10] = 10100 yield path @pytest.fixture(scope='module') def agipd_file_tid_low(): with TemporaryDirectory() as td: path = osp.join(td, 'CORR-R9999-AGIPD07-S00000.h5') make_examples.make_agipd_example_file(path, format_version='0.5') with h5py.File(path, 'r+') as f: # Initial train IDs are np.arange(10000, 10250) f['INDEX/trainId'][20] = 9000 yield path @pytest.fixture() def agipd_file_flag0(): with TemporaryDirectory() as td: path = osp.join(td, 'CORR-R9999-AGIPD07-S00000.h5') make_examples.make_agipd_file(path, format_version='1.0') with h5py.File(path, 'r+') as f: f['INDEX/flag'][30] = 0 yield path def test_guess_validity(agipd_file_tid_very_high, agipd_file_tid_high, agipd_file_tid_low): fa = FileAccess(agipd_file_tid_very_high) assert fa.validity_flag.sum() == 249 assert not fa.validity_flag[10] fa = FileAccess(agipd_file_tid_high) assert fa.validity_flag.sum() == 485 assert not fa.validity_flag[10] fa = FileAccess(agipd_file_tid_low) assert fa.validity_flag.sum() == 249 assert not fa.validity_flag[20] def test_validity_flag(agipd_file_flag0): fa = FileAccess(agipd_file_flag0) assert fa.validity_flag.sum() == 485 assert not fa.validity_flag[30] def test_exc_trainid(agipd_file_tid_very_high, agipd_file_tid_high, agipd_file_tid_low, agipd_file_flag0): f = H5File(agipd_file_tid_very_high, inc_suspect_trains=False) assert len(f.train_ids) == 249 assert 10400 not in f.train_ids f = H5File(agipd_file_tid_very_high, inc_suspect_trains=True) assert len(f.train_ids) == 250 assert 10400 in f.train_ids f = H5File(agipd_file_tid_high, inc_suspect_trains=False) assert len(f.train_ids) == 485 assert 10100 in f.train_ids with pytest.raises(ValueError): H5File(agipd_file_tid_high, inc_suspect_trains=True) f = H5File(agipd_file_tid_low, inc_suspect_trains=False) assert len(f.train_ids) == 249 assert 9000 not in f.train_ids f = H5File(agipd_file_tid_low, inc_suspect_trains=True) assert len(f.train_ids) == 250 assert 9000 in f.train_ids f = H5File(agipd_file_flag0, inc_suspect_trains=False) assert len(f.train_ids) == 485 assert 10030 not in f.train_ids f = H5File(agipd_file_flag0, inc_suspect_trains=True) assert len(f.train_ids) == 486 assert 10030 in f.train_ids # If the tests above pass, the invalid trains in the different sample files # are being recognised correctly. So for the tests below, we'll mainly test # each behaviour on just one of the sample files. def test_keydata_interface(agipd_file_tid_very_high): f = H5File(agipd_file_tid_very_high, inc_suspect_trains=False) kd = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.data'] assert len(kd.train_ids) == 249 assert kd.shape == (249 * 64, 512, 128) f = H5File(agipd_file_tid_very_high, inc_suspect_trains=True) kd = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.data'] assert len(kd.train_ids) == 250 assert kd.shape == (250 * 64, 512, 128) # Check selecting trains preserves inc_suspect_trains flag assert kd[:].shape == (250 * 64, 512, 128) def test_data_counts(agipd_file_flag0): f = H5File(agipd_file_flag0, inc_suspect_trains=False) kd = f['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.data'] assert 10030 not in kd.data_counts().index f = H5File(agipd_file_flag0, inc_suspect_trains=True) kd = f['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.data'] assert 10030 in kd.data_counts().index def test_array(agipd_file_tid_low): f = H5File(agipd_file_tid_low, inc_suspect_trains=False) arr = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId'].xarray() assert arr.shape == (249 * 64, 1) f = H5File(agipd_file_tid_low, inc_suspect_trains=True) arr = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId'].xarray() assert arr.shape == (250 * 64, 1) def test_array_dup(agipd_file_tid_high): f = H5File(agipd_file_tid_high, inc_suspect_trains=False) arr = f['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.pulseId'].xarray() assert arr.shape == (485 * 64, 1) assert list(arr.coords['trainId'].values[(9*64):(11*64):64]) == [10009, 10011] # Can't open files with duplicate train IDs using inc_suspect_trains=True def test_dask_array(agipd_file_flag0): f = H5File(agipd_file_flag0, inc_suspect_trains=False) arr = f['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.pulseId'].dask_array() assert arr.shape == (485 * 64, 1) f = H5File(agipd_file_flag0, inc_suspect_trains=True) arr = f['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.pulseId'].dask_array() assert arr.shape == (486 * 64, 1) def test_iterate_keydata(agipd_file_tid_very_high): f = H5File(agipd_file_tid_very_high, inc_suspect_trains=False) kd = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId'] tids = [t for (t, _) in kd.trains()] assert len(tids) == 249 assert 10400 not in tids f = H5File(agipd_file_tid_very_high, inc_suspect_trains=True) kd = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId'] tids = [t for (t, _) in kd.trains()] assert len(tids) == 250 assert 10400 in tids def test_iterate_keydata_dup(agipd_file_tid_high): f = H5File(agipd_file_tid_high, inc_suspect_trains=False) kd = f['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.pulseId'] tids = [t for (t, _) in kd.trains()] assert len(tids) == 485 assert 10100 in tids assert tids[9:11] == [10009, 10011] def test_iterate_datacollection(agipd_file_tid_low): f = H5File(agipd_file_tid_low, inc_suspect_trains=False) tids = [t for (t, _) in f.trains()] assert len(tids) == 249 assert 9000 not in tids def test_get_train_keydata(agipd_file_tid_low): f = H5File(agipd_file_tid_low, inc_suspect_trains=False) kd = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId'] with pytest.raises(TrainIDError): kd.train_from_id(9000) f = H5File(agipd_file_tid_low, inc_suspect_trains=True) kd = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId'] assert kd.train_from_id(9000)[0] == 9000 def test_components_array(agipd_file_flag0): f = H5File(agipd_file_flag0, inc_suspect_trains=False) agipd = AGIPD1M(f, modules=[0]) arr = agipd.get_array('image.data', pulses=np.s_[:1]) assert arr.shape == (1, 485, 1, 2, 512, 128) assert arr.dims == ('module', 'train', 'pulse', 'data_gain', 'slow_scan', 'fast_scan') def test_components_array_dup(agipd_file_tid_high): f = H5File(agipd_file_tid_high, inc_suspect_trains=False) agipd = AGIPD1M(f, modules=[0]) arr = agipd.get_array('image.data', pulses=np.s_[:1]) assert arr.shape == (1, 485, 1, 2, 512, 128) assert arr.dims == ('module', 'train', 'pulse', 'data_gain', 'slow_scan', 'fast_scan') assert list(arr.coords['train'].values[9:11]) == [10009, 10011] def test_write_virtual_cxi_dup(agipd_file_tid_high, tmp_path, caplog): f = H5File(agipd_file_tid_high, inc_suspect_trains=False) agipd = AGIPD1M(f, modules=[0]) cxi_path = tmp_path / 'exc_suspect.cxi' agipd.write_virtual_cxi(str(cxi_path)) assert_isfile(cxi_path) with h5py.File(cxi_path, 'r') as f: assert f['entry_1/data_1/data'].shape == (485 * 64, 16, 2, 512, 128) def test_write_virtual(agipd_file_tid_low, agipd_file_tid_high, tmp_path): f = H5File(agipd_file_tid_low, inc_suspect_trains=False) f.write_virtual(tmp_path / 'low.h5') with h5py.File(tmp_path / 'low.h5', 'r') as vf: assert 9000 not in vf['INDEX/trainId'][:] ds = vf['INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/pulseId'] assert ds.shape == (249 * 64, 1) f = H5File(agipd_file_tid_high, inc_suspect_trains=False) f.write_virtual(tmp_path / 'high.h5') with h5py.File(tmp_path / 'high.h5', 'r') as vf: ds = vf['INSTRUMENT/SPB_DET_AGIPD1M-1/DET/0CH0:xtdf/image/trainId'] assert ds.shape == (485 * 64, 1) assert list(ds[(9*64):(11*64):64]) == [10009, 10011] def test_still_valid_elsewhere(agipd_file_tid_very_high, mock_sa3_control_data): dc = H5File( agipd_file_tid_very_high, inc_suspect_trains=False ).union(H5File(mock_sa3_control_data)) assert dc.train_ids == list(range(10000, 10500)) agipd_src = 'SPB_DET_AGIPD1M-1/DET/7CH0:xtdf' tsens_src = 'SA3_XTD10_VAC/TSENS/S30250K' sel = dc.select({ agipd_src: {'image.pulseId'}, tsens_src: {'value.value'} }) assert sel.all_sources == {agipd_src, tsens_src} _, t1 = sel.train_from_id(10200, flat_keys=True) assert set(t1) >= {(agipd_src, 'image.pulseId'), (tsens_src, 'value.value')} _, t2 = sel.train_from_id(10400, flat_keys=True) assert (agipd_src, 'image.pulseId') not in t2 assert (tsens_src, 'value.value') in t2 tids_from_iter, data_from_iter = [], [] for tid, d in sel.trains(flat_keys=True): if tid in (10200, 10400): tids_from_iter.append(tid) data_from_iter.append(d) assert tids_from_iter == [10200, 10400] assert [set(d) for d in data_from_iter] == [set(t1), set(t2)] # Check that select with require_all respects the valid train filtering: sel2 = dc.select(agipd_src, require_all=True) assert len(sel2.train_ids) == 249 dc_inc = H5File(agipd_file_tid_very_high, inc_suspect_trains=True)\ .union(H5File(mock_sa3_control_data)) sel_inc = dc_inc.select(sel) _, t2_inc = sel_inc.train_from_id(10400, flat_keys=True) assert set(t2_inc) == set(t1) sel2_inc = dc_inc.select(agipd_src, require_all=True) assert len(sel2_inc.train_ids) == 250 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_components.py0000644000175100001660000006760014757376472022424 0ustar00runnerdockerimport dask.array as da import h5py import numpy as np import os.path as osp import pytest from testpath import assert_isfile from extra_data.reader import RunDirectory, H5File, by_id, by_index from extra_data.components import ( AGIPD1M, DSSC1M, LPD1M, JUNGFRAU, identify_multimod_detectors, ) def test_get_array(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:3])) assert det.detector_name == 'FXE_DET_LPD1M-1' arr = det.get_array('image.data') assert arr.dtype == np.uint16 assert arr.shape == (16, 3, 128, 256, 256) assert arr.dims == ('module', 'train', 'pulse', 'slow_scan', 'fast_scan') arr = det.get_array('image.data', pulses=by_index[:10], unstack_pulses=False) assert arr.shape == (16, 30, 256, 256) assert arr.dtype == np.uint16 assert arr.dims == ('module', 'train_pulse', 'slow_scan', 'fast_scan') # fill value with pytest.raises(ValueError): det.get_array('image.data', fill_value=np.nan) arr = det.get_array('image.data', astype=np.float32) assert arr.dtype == np.float32 def test_get_array_pulse_id(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:3])) arr = det.get_array('image.data', pulses=by_id[0]) assert arr.shape == (16, 3, 1, 256, 256) assert (arr.coords['pulse'] == 0).all() arr = det.get_array('image.data', pulses=by_id[:5]) assert arr.shape == (16, 3, 5, 256, 256) # Empty selection arr = det.get_array('image.data', pulses=by_id[:0]) assert arr.shape == (16, 0, 0, 256, 256) arr = det.get_array('image.data', pulses=by_id[122:]) assert arr.shape == (16, 3, 6, 256, 256) arr = det.get_array('image.data', pulses=by_id[[1, 7, 22, 23]]) assert arr.shape == (16, 3, 4, 256, 256) assert list(arr.coords['pulse']) == [1, 7, 22, 23] def test_get_array_with_cell_ids(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:3])) arr = det.get_array('image.data', subtrain_index='cellId') assert arr.shape == (16, 3, 128, 256, 256) assert arr.dims == ('module', 'train', 'cell', 'slow_scan', 'fast_scan') arr = det.get_array('image.data', pulses=by_id[0], subtrain_index='cellId') assert arr.shape == (16, 3, 1, 256, 256) assert (arr.coords['cell'] == 0).all() def test_get_array_pulse_indexes(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:3])) arr = det.get_array('image.data', pulses=by_index[0]) assert arr.shape == (16, 3, 1, 256, 256) assert (arr.coords['pulse'] == 0).all() arr = det.get_array('image.data', pulses=by_index[:5]) assert arr.shape == (16, 3, 5, 256, 256) # Empty selection arr = det.get_array('image.data', pulses=by_index[:0]) assert arr.shape == (16, 0, 0, 256, 256) arr = det.get_array('image.data', pulses=by_index[122:]) assert arr.shape == (16, 3, 6, 256, 256) arr = det.get_array('image.data', pulses=by_index[[1, 7, 22, 23]]) assert arr.shape == (16, 3, 4, 256, 256) def test_get_array_pulse_id_reduced_data(mock_reduced_spb_proc_run): run = RunDirectory(mock_reduced_spb_proc_run) det = AGIPD1M(run.select_trains(by_index[:3])) arr = det.get_array('image.data', pulses=by_id[0]) assert arr.shape == (16, 3, 1, 512, 128) assert (arr.coords['pulse'] == 0).all() arr = det.get_array('image.data', pulses=by_id[:5]) assert (arr.coords['pulse'] < 5).all() # Empty selection arr = det.get_array('image.data', pulses=by_id[:0]) assert arr.shape == (16, 0, 0, 512, 128) arr = det.get_array('image.data', pulses=by_id[5:]) assert (arr.coords['pulse'] >= 5).all() arr = det.get_array('image.data', pulses=by_id[[1, 7, 15, 23]]) assert np.isin(arr.coords['pulse'], [1, 7, 15, 23]).all() def test_get_array_pulse_indexes_reduced_data(mock_reduced_spb_proc_run): run = RunDirectory(mock_reduced_spb_proc_run) det = AGIPD1M(run.select_trains(by_index[:3])) arr = det.get_array('image.data', pulses=by_index[0]) assert arr.shape == (16, 3, 1, 512, 128) assert (arr.coords['pulse'] == 0).all() arr = det.get_array('image.data', pulses=by_index[:5]) assert (arr.coords['pulse'] < 5).all() # Empty selection arr = det.get_array('image.data', pulses=by_index[:0]) assert arr.shape == (16, 0, 0, 512, 128) arr = det.get_array('image.data', pulses=np.s_[5:]) assert (arr.coords['pulse'] >= 5).all() arr = det.get_array('image.data', pulses=by_index[[1, 7, 15, 23]]) assert np.isin(arr.coords['pulse'], [1, 7, 15, 23]).all() arr = det.get_array('image.data', pulses=[1, 7, 15, 23]) assert np.isin(arr.coords['pulse'], [1, 7, 15, 23]).all() def test_get_array_gap(mock_lpd_mini_gap_run): run = RunDirectory(mock_lpd_mini_gap_run) det = LPD1M(run, modules=[0, 1]) # All pulses arr = det.get_array('image.data') assert arr.shape == (2, 5, 10, 256, 256) np.testing.assert_array_equal(arr[1, :, 8, 0, 0], [1, 2, 0, 3, 4]) # Selected pulses arr = det.get_array('image.data', pulses=[8]) assert arr.shape == (2, 5, 1, 256, 256) np.testing.assert_array_equal(arr[1, :, 0, 0, 0], [1, 2, 0, 3, 4]) def test_get_array_roi(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:3])) assert det.detector_name == 'FXE_DET_LPD1M-1' arr = det.get_array('image.data', roi=np.s_[10:60, 100:200]) assert arr.shape == (16, 3, 128, 50, 100) assert arr.dims == ('module', 'train', 'pulse', 'slow_scan', 'fast_scan') def test_get_array_roi_dssc(mock_scs_run): run = RunDirectory(mock_scs_run) det = DSSC1M(run, modules=[3]) arr = det.get_array('image.data', roi=np.s_[20:25, 40:52]) assert arr.shape == (1, 128, 64, 5, 12) def test_ndarray_module_gaps(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run, modules=[2]).select_trains(np.s_[:3]) det_data = det['image.data'] assert det_data.shape == (1, 128 * 3, 256, 256) assert det_data.ndarray().shape == (1, 128 * 3, 256, 256) arr_w_gaps = det_data.ndarray(module_gaps=True, fill_value=7) assert arr_w_gaps.shape == (16, 128 * 3, 256, 256) assert arr_w_gaps[:, 0, 0, 0].tolist() == ([7] * 2) + [0] + ([7] * 13) def test_get_array_lpd_parallelgain(mock_lpd_parallelgain_run): run = RunDirectory(mock_lpd_parallelgain_run) det = LPD1M(run.select_trains(by_index[:2]), parallel_gain=True) assert det.detector_name == 'FXE_DET_LPD1M-1' arr = det.get_array('image.data') assert arr.shape == (16, 2, 3, 100, 256, 256) assert arr.dims == ('module', 'train', 'gain', 'pulse', 'slow_scan', 'fast_scan') np.testing.assert_array_equal(arr.coords['gain'], np.arange(3)) np.testing.assert_array_equal(arr.coords['pulse'], np.arange(100)) def test_get_array_lpd_parallelgain_select_pulses(mock_lpd_parallelgain_run): run = RunDirectory(mock_lpd_parallelgain_run) det = LPD1M(run.select_trains(by_index[:2]), parallel_gain=True) assert det.detector_name == 'FXE_DET_LPD1M-1' arr = det.get_array('image.data', pulses=np.s_[:5]) assert arr.shape == (16, 2, 3, 5, 256, 256) assert arr.dims == ('module', 'train', 'gain', 'pulse', 'slow_scan', 'fast_scan') np.testing.assert_array_equal(arr.coords['gain'], np.arange(3)) np.testing.assert_array_equal(arr.coords['pulse'], np.arange(5)) arr = det.get_array('image.data', pulses=by_id[:5]) assert arr.shape == (16, 2, 3, 5, 256, 256) np.testing.assert_array_equal(arr.coords['pulse'], np.arange(5)) def test_get_array_jungfrau(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) jf = JUNGFRAU(run.select_trains(by_index[:2])) assert jf.detector_name == 'SPB_IRDA_JF4M' arr = jf.get_array('data.adc') assert arr.shape == (8, 2, 16, 512, 1024) assert arr.dims == ('module', 'train', 'cell', 'slow_scan', 'fast_scan') np.testing.assert_array_equal(arr.coords['train'], [10000, 10001]) arr = jf.get_array('data.adc', astype=np.float32) assert arr.dtype == np.float32 assert jf['data.adc'].shape == (8, 2, 16, 512, 1024) assert jf['data.adc'].buffer_shape( module_gaps=True, roi=np.s_[:, :25, :35] ) == (8, 2, 16, 25, 35) def test_jungfraus_first_modno(mock_jungfrau_run, mock_fxe_jungfrau_run): # Test SPB_IRDA_JF4M component by setting the first_modno to the default value 1. run = RunDirectory(mock_jungfrau_run) jf = JUNGFRAU(run.select_trains(by_index[:2]), first_modno=1) assert jf.detector_name == 'SPB_IRDA_JF4M' assert jf.n_modules == 8 arr = jf.get_array('data.adc') assert np.all(arr['module'] == list(range(1, 9))) # Test FXE_XAD_JF500K component with and without setting first_modno to 3. for first_modno, modno in zip([1, 3], [3, 1]): run = RunDirectory(mock_fxe_jungfrau_run) jf = JUNGFRAU( run.select_trains(by_index[:2]), detector_name='FXE_XAD_JF500K', first_modno=first_modno, ) assert jf.detector_name == 'FXE_XAD_JF500K' assert jf.n_modules == modno arr = jf.get_array('data.adc') assert np.all(arr['module'] == [modno]) def test_jungfrau_masked_data(mock_fxe_jungfrau_run): run = RunDirectory(mock_fxe_jungfrau_run) jf = JUNGFRAU(run, 'FXE_XAD_JF500K') # Default options kd = jf.masked_data().select_trains(np.s_[:1]) arr = kd.ndarray() assert arr.shape == (1, 1, 16, 512, 1024) assert arr.dtype == np.float32 line0 = np.zeros(1024, dtype=np.float32) line0[1:32] = np.nan np.testing.assert_array_equal(arr[0, 0, 0, 0, :], line0) # Xarray xarr = kd.xarray() assert xarr.dims[:2] == ('module', 'trainId') np.testing.assert_array_equal(xarr.values[0, 0, 0, 0, :], line0) # Specify which mask bits to use, & replace masked values with 99 kd = jf.masked_data(mask_bits=1, masked_value=99).select_trains(np.s_[:1]) arr = kd.ndarray() assert arr.shape == (1, 1, 16, 512, 1024) line0 = np.zeros(1024, dtype=np.float32) line0[1:32:2] = 99 np.testing.assert_array_equal(arr[0, 0, 0, 0, :], line0) # Different field kd = jf.masked_data('data.gain', masked_value=255).select_trains(np.s_[:1]) arr = kd.ndarray() assert arr.shape == (1, 1, 16, 512, 1024) assert arr.dtype == np.uint8 line0 = np.zeros(1024, dtype=np.uint8) line0[1:32] = 255 np.testing.assert_array_equal(arr[0, 0, 0, 0, :], line0) def test_xtdf_masked_data(mock_reduced_spb_proc_run): run = RunDirectory(mock_reduced_spb_proc_run) agipd = AGIPD1M(run, modules=[8, 9]) kd = agipd.masked_data().select_trains(np.s_[:1]) assert kd.shape == (2, kd.shape[1], 512, 128) arr = kd.ndarray() assert arr.shape == kd.shape assert arr.dtype == np.float32 line0_2mod = np.zeros((2, 128), dtype=np.float32) line0_2mod[1, 1:32] = np.nan np.testing.assert_array_equal(arr[:, 0, 0, :], line0_2mod) # Test with pulse selection (frames per train is consistent but arbitrary) kd_pulse_sel = kd.select_pulses(np.s_[:3]) assert kd_pulse_sel.shape[1] <= 3 assert kd_pulse_sel.ndarray().shape == kd_pulse_sel.shape kd = agipd.masked_data(mask_bits=[1, 4], masked_value=-1).select_trains(np.s_[:1]) arr = kd.ndarray() line0_2mod = np.zeros((2, 128), dtype=np.float32) line0_2mod[1, np.nonzero(np.arange(32) & 5)] = -1 np.testing.assert_array_equal(arr[:, 0, 0, :], line0_2mod) def test_masked_data_raw_error(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) lpd = LPD1M(run) with pytest.raises(RuntimeError, match="image.mask"): lpd.masked_data() def test_get_dask_array(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run) arr = det.get_dask_array('image.data', fill_value=42) assert isinstance(arr.data, da.Array) assert arr.shape == (16, 480 * 128, 1, 256, 256) assert arr.dtype == np.uint16 assert arr.dims == ('module', 'train_pulse', 'dim_0', 'dim_1', 'dim_2') np.testing.assert_array_equal(arr.coords['module'], np.arange(16)) np.testing.assert_array_equal( arr.coords['trainId'], np.repeat(np.arange(10000, 10480), 128) ) np.testing.assert_array_equal( arr.coords['pulseId'], np.tile(np.arange(0, 128), 480) ) arr_cellid = det.get_dask_array('image.data', subtrain_index='cellId') assert arr_cellid.coords['cellId'].shape == (480 * 128,) def test_get_dask_array_reduced_data(mock_reduced_spb_proc_run): run = RunDirectory(mock_reduced_spb_proc_run) det = AGIPD1M(run) arr = det.get_dask_array('image.data') assert arr.shape[2:] == (512, 128) assert arr.dims == ('module', 'train_pulse', 'dim_0', 'dim_1') np.testing.assert_array_equal(arr.coords['module'], np.arange(16)) assert np.isin(arr.coords['trainId'], np.arange(10000, 10480)).all() assert np.isin(arr.coords['pulseId'], np.arange(0, 20)).all() def test_get_dask_array_lpd_parallelgain(mock_lpd_parallelgain_run): run = RunDirectory(mock_lpd_parallelgain_run) det = LPD1M(run.select_trains(by_index[:2]), parallel_gain=True) assert det.detector_name == 'FXE_DET_LPD1M-1' arr = det.get_dask_array('image.data') assert arr.shape == (16, 2 * 3 * 100, 1, 256, 256) assert arr.dims[:2] == ('module', 'train_pulse') np.testing.assert_array_equal(arr.coords['pulseId'], np.tile(np.arange(100), 6)) def test_get_dask_array_jungfrau(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) jf = JUNGFRAU(run) assert jf.detector_name == 'SPB_IRDA_JF4M' arr = jf.get_dask_array('data.adc') assert arr.shape == (8, 100, 16, 512, 1024) assert arr.dims == ('module', 'train', 'cell', 'slow_scan', 'fast_scan') np.testing.assert_array_equal(arr.coords['train'], np.arange(10000, 10100)) def test_data_availability_lpd_gap(mock_lpd_mini_gap_run): run = RunDirectory(mock_lpd_mini_gap_run) det = LPD1M(run, modules=[0, 1]) # This example just contains 2 modules av = det.data_availability() assert av.shape == (2, 50) np.testing.assert_array_equal(av[1, 20:30], False) assert av.sum() == 2 * 50 - 10 av_gaps = det.data_availability(module_gaps=True) assert av_gaps.shape == (16, 50) np.testing.assert_array_equal(av_gaps[2:], False) assert av_gaps.sum() == 2 * 50 - 10 def test_pulse_id_cell_id(mock_lpd_mini_gap_run): run = RunDirectory(mock_lpd_mini_gap_run) det = LPD1M(run, modules=[0, 1]) # This example just contains 2 modules kd = det['image.data'] np.testing.assert_array_equal( kd.pulse_id_coordinates(), np.tile(np.arange(10), 5) ) np.testing.assert_array_equal( kd.cell_id_coordinates(), np.tile(np.arange(10), 5) ) def test_pulse_id_cell_id_reduced(mock_reduced_spb_proc_run): run = RunDirectory(mock_reduced_spb_proc_run) det = AGIPD1M(run) kd = det['image.data'] nframes = kd.shape[1] # The selected frames are random, so we don't know precisely the pattern assert kd.train_id_coordinates().shape == (nframes,) assert kd.pulse_id_coordinates().shape == (nframes,) assert kd.cell_id_coordinates().shape == (nframes,) def test_jungfrau_cell_ids(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) det = JUNGFRAU(run) cellids = det.cell_ids() assert cellids.shape == (16,) def test_select_trains(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(np.s_[:20])) assert len(det.train_ids) == 20 det = det.select_trains(np.s_[:2]) assert len(det.train_ids) == 2 arr = det.get_array('image.data') assert arr.shape == (16, 2, 128, 256, 256) def test_keydata_select_trains(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(np.s_[:20])) kd = det['image.data'] assert len(kd.train_ids) == 20 assert kd.shape == (16, 20 * 128, 256, 256) kd = kd[:3] assert len(kd.train_ids) == 3 assert kd.shape == (16, 3 * 128, 256, 256) with pytest.raises(TypeError): iter(kd) def test_split_trains(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(np.s_[:20])) assert len(det.train_ids) == 20 parts = list(det.split_trains(parts=4)) assert len(parts) == 4 assert [len(p.train_ids) for p in parts] == [5, 5, 5, 5] assert sum([p.train_ids for p in parts], []) == det.train_ids arr = parts[-1].get_array('image.data', pulses=np.s_[:1]) assert arr.shape == (16, 5, 1, 256, 256) # Split by a number of frames parts = list(det.split_trains(frames_per_part=256)) assert [len(p.train_ids) for p in parts] == [2] * 10 # frames_per_part less than one train (128 frames in this example) parts = list(det.split_trains(frames_per_part=100)) assert [len(p.train_ids) for p in parts] == [1] * 20 # trains_per_part cuts off before frames_per_part parts = list(det.split_trains(trains_per_part=3, frames_per_part=1024)) assert [len(p.train_ids) for p in parts] == ([3] * 6) + [2] # parts cuts off before frames_per_part parts = list(det.split_trains(parts=6, frames_per_part=1024)) assert [len(p.train_ids) for p in parts] == ([3] * 6) + [2] # frames_per_part > all frames in selection parts = list(det.split_trains(frames_per_part=3000)) assert [len(p.train_ids) for p in parts] == [20] def test_split_trains_jungfrau(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) jf = JUNGFRAU(run.select_trains(np.s_[:20])) assert jf.frames_per_train == 16 parts = list(jf.split_trains(frames_per_part=64)) assert [len(p.train_ids) for p in parts] == [4] * 5 def test_iterate(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:2])) it = iter(det.trains()) tid, d = next(it) assert d['image.data'].shape == (16, 1, 128, 256, 256) assert d['image.data'].dims == ('module', 'train', 'pulse', 'slow_scan', 'fast_scan') tid, d = next(it) assert d['image.data'].shape == (16, 1, 128, 256, 256) with pytest.raises(StopIteration): next(it) def test_iterate_pulse_id(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:3])) tid, d = next(iter(det.trains(pulses=by_id[0]))) assert d['image.data'].shape == (16, 1, 1, 256, 256) tid, d = next(iter(det.trains(pulses=by_id[:5]))) assert d['image.data'].shape == (16, 1, 5, 256, 256) tid, d = next(iter(det.trains(pulses=by_id[122:]))) assert d['image.data'].shape == (16, 1, 6, 256, 256) tid, d = next(iter(det.trains(pulses=by_id[[1, 7, 22, 23]]))) assert d['image.data'].shape == (16, 1, 4, 256, 256) assert list(d['image.data'].coords['pulse']) == [1, 7, 22, 23] def test_iterate_pulse_index(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:3])) tid, d = next(iter(det.trains(pulses=by_index[0]))) assert d['image.data'].shape == (16, 1, 1, 256, 256) tid, d = next(iter(det.trains(pulses=by_index[:5]))) assert d['image.data'].shape == (16, 1, 5, 256, 256) tid, d = next(iter(det.trains(pulses=by_index[122:]))) assert d['image.data'].shape == (16, 1, 6, 256, 256) tid, d = next(iter(det.trains(pulses=by_index[[1, 7, 22, 23]]))) assert d['image.data'].shape == (16, 1, 4, 256, 256) assert list(d['image.data'].coords['pulse']) == [1, 7, 22, 23] def test_iterate_lpd_parallel_gain(mock_lpd_parallelgain_run): run = RunDirectory(mock_lpd_parallelgain_run) det = LPD1M(run.select_trains(by_index[:3]), parallel_gain=True) tid, d = next(iter(det.trains())) assert d['image.data'].shape == (16, 1, 3, 100, 256, 256) assert d['image.data'].dims == \ ('module', 'train', 'gain', 'pulse', 'slow_scan', 'fast_scan') def test_iterate_jungfrau(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) jf = JUNGFRAU(run) tid, d = next(iter(jf.trains())) assert tid == 10000 assert d['data.adc'].shape == (8, 16, 512, 1024) assert d['data.adc'].dims == ('module', 'cell', 'slow_scan', 'fast_scan') def test_modern_corr_sources(mock_modern_spb_proc_run, mock_spb_raw_run_fmt1): run_raw = RunDirectory(mock_spb_raw_run_fmt1) run_proc = RunDirectory(mock_modern_spb_proc_run) combined = run_raw.union(run_proc.select("*/CORR/*:output")) corr_sources = {f'SPB_DET_AGIPD1M-1/CORR/{i}CH0:output' for i in range(16)} det_sources = {f'SPB_DET_AGIPD1M-1/DET/{i}CH0:xtdf' for i in range(16)} # Specify that we want raw data assert AGIPD1M(run_raw, raw=True).data.all_sources == det_sources with pytest.raises(Exception): AGIPD1M(run_proc, raw=True) agipd_raw = AGIPD1M(combined, raw=True) assert agipd_raw.data.all_sources == det_sources assert 'image.mask' not in agipd_raw # Specify that we want corrected data with pytest.raises(Exception): AGIPD1M(run_raw, raw=False) assert AGIPD1M(run_proc, raw=False).data.all_sources == corr_sources agipd_proc = AGIPD1M(combined, raw=False) assert agipd_proc.data.all_sources == corr_sources assert 'image.mask' in agipd_proc # Legacy behaviour: prefer corrected, allow raw if only that is found assert AGIPD1M(run_raw).data.all_sources == det_sources assert AGIPD1M(run_proc).data.all_sources == corr_sources agipd_dflt = AGIPD1M(combined) assert agipd_dflt.data.all_sources == corr_sources assert 'image.mask' in agipd_dflt def test_write_virtual_cxi(mock_spb_proc_run, tmpdir): run = RunDirectory(mock_spb_proc_run) det = AGIPD1M(run) test_file = osp.join(str(tmpdir), 'test.cxi') det.write_virtual_cxi(test_file) assert_isfile(test_file) with h5py.File(test_file, 'r') as f: det_grp = f['entry_1/instrument_1/detector_1'] ds = det_grp['data'] assert isinstance(ds, h5py.Dataset) assert ds.is_virtual assert ds.shape[1:] == (16, 512, 128) assert 'axes' in ds.attrs assert len(ds.virtual_sources()) == 16 # Check position of each source file in the modules dimension for src in ds.virtual_sources(): start, _, block, count = src.vspace.get_regular_hyperslab() assert block[1] == 1 assert count[1] == 1 expected_file = 'CORR-R0238-AGIPD{:0>2}-S00000.h5'.format(start[1]) assert osp.basename(src.file_name) == expected_file # Check presence of other datasets assert 'gain' in det_grp assert 'mask' in det_grp assert 'experiment_identifier' in det_grp def test_write_virtual_cxi_some_modules(mock_spb_proc_run, tmpdir): run = RunDirectory(mock_spb_proc_run) det = AGIPD1M(run, modules=[3, 4, 8, 15]) test_file = osp.join(str(tmpdir), 'test.cxi') det.write_virtual_cxi(test_file) assert_isfile(test_file) with h5py.File(test_file, 'r') as f: det_grp = f['entry_1/instrument_1/detector_1'] ds = det_grp['data'] assert ds.shape[1:] == (16, 512, 128) def test_write_virtual_cxi_jungfrau(mock_jungfrau_run, tmpdir): run = RunDirectory(mock_jungfrau_run) det = JUNGFRAU(run) test_file = osp.join(str(tmpdir), 'test.cxi') det.write_virtual_cxi(test_file) assert_isfile(test_file) with h5py.File(test_file, 'r') as f: det_grp = f['entry_1/instrument_1/detector_1'] ds = det_grp['data'] assert isinstance(ds, h5py.Dataset) assert ds.is_virtual assert ds.shape[1:] == (8, 512, 1024) assert 'axes' in ds.attrs assert len(ds.virtual_sources()) == 8 # Check position of each source file in the modules dimension for src in ds.virtual_sources(): start, _, block, count = src.vspace.get_regular_hyperslab() assert block[1] == 1 assert count[1] == 1 expected_file = 'RAW-R0012-JNGFR{:0>2}-S00000.h5'.format( start[1] + 1) assert osp.basename(src.file_name) == expected_file # Check presence of other datasets assert 'gain' in det_grp assert 'mask' in det_grp assert 'experiment_identifier' in det_grp def test_write_virtual_cxi_jungfrau_some_modules(mock_jungfrau_run, tmpdir): run = RunDirectory(mock_jungfrau_run) det = JUNGFRAU(run, modules=[2, 3, 4, 6]) test_file = osp.join(str(tmpdir), 'test.cxi') det.write_virtual_cxi(test_file) assert_isfile(test_file) with h5py.File(test_file, 'r') as f: det_grp = f['entry_1/instrument_1/detector_1'] ds = det_grp['data'] assert ds.shape[1:] == (8, 512, 1024) np.testing.assert_array_equal(det_grp['module_identifier'][:], np.arange(1,9)) def test_write_virtual_cxi_raw_data(mock_fxe_raw_run, tmpdir, caplog): import logging caplog.set_level(logging.INFO) run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run) test_file = osp.join(str(tmpdir), 'test.cxi') det.write_virtual_cxi(test_file) assert_isfile(test_file) with h5py.File(test_file, 'r') as f: det_grp = f['entry_1/instrument_1/detector_1'] ds = det_grp['data'] assert ds.shape[1:] == (16, 1, 256, 256) def test_write_virtual_cxi_reduced_data(mock_reduced_spb_proc_run, tmpdir): run = RunDirectory(mock_reduced_spb_proc_run) det = AGIPD1M(run) test_file = osp.join(str(tmpdir), 'test.cxi') det.write_virtual_cxi(test_file) assert_isfile(test_file) with h5py.File(test_file, 'r') as f: det_grp = f['entry_1/instrument_1/detector_1'] ds = det_grp['data'] assert ds.shape[1:] == (16, 512, 128) def test_write_selected_frames(mock_spb_raw_run, tmp_path): run = RunDirectory(mock_spb_raw_run) det = AGIPD1M(run) trains = np.repeat(np.arange(10000, 10006), 2) pulses = np.tile([0, 5], 6) test_file = tmp_path / 'sel_frames.h5' det.write_frames(test_file, trains, pulses) assert_isfile(test_file) with H5File(test_file) as f: np.testing.assert_array_equal( f.get_array('SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.pulseId')[:, 0], pulses ) assert f.instrument_sources == { f'SPB_DET_AGIPD1M-1/DET/{i}CH0:xtdf' for i in range(16) } # pytest leaves temp files for inspection, but these files are big enough # to be inconvenient, so delete them if the assertions have passed. test_file.unlink() def test_write_selected_frames_proc(mock_spb_proc_run, tmp_path): run = RunDirectory(mock_spb_proc_run) det = AGIPD1M(run) trains = np.repeat(np.arange(10000, 10006), 2) pulses = np.tile([0, 7], 6) test_file = tmp_path / 'sel_frames.h5' det.write_frames(test_file, trains, pulses) assert_isfile(test_file) with H5File(test_file) as f: np.testing.assert_array_equal( f.get_array('SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.pulseId'), pulses ) assert f.instrument_sources == { f'SPB_DET_AGIPD1M-1/DET/{i}CH0:xtdf' for i in range(16) } # pytest leaves temp files for inspection, but these files are big enough # to be inconvenient, so delete them if the assertions have passed. test_file.unlink() def test_identify_multimod_detectors(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) name, cls = identify_multimod_detectors(run, single=True) assert name == 'FXE_DET_LPD1M-1' assert cls is LPD1M dets = identify_multimod_detectors(run, single=False) assert dets == {(name, cls)} def test_identify_multimod_detectors_multi(mock_fxe_raw_run, mock_spb_raw_run): fxe_run = RunDirectory(mock_fxe_raw_run) spb_run = RunDirectory(mock_spb_raw_run) combined = fxe_run.select('*LPD1M*').union(spb_run) dets = identify_multimod_detectors(combined, single=False) assert dets == {('FXE_DET_LPD1M-1', LPD1M), ('SPB_DET_AGIPD1M-1', AGIPD1M)} with pytest.raises(ValueError): identify_multimod_detectors(combined, single=True) name, cls = identify_multimod_detectors(combined, single=True, clses=[AGIPD1M]) assert name == 'SPB_DET_AGIPD1M-1' assert cls is AGIPD1M ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_copy.py0000644000175100001660000000470614757376472021207 0ustar00runnerdockerfrom pathlib import Path import h5py import numpy as np from extra_data.copy import copy_structure, main def test_copy_structure(tmp_path, mock_sa3_control_data): xgm = "SA3_XTD10_XGM/XGM/DOOCS" xgm_intensity = f"INSTRUMENT/{xgm}:output/data/intensityTD" xgm_flux = f"CONTROL/{xgm}/pulseEnergy/photonFlux/value" ext_file = 'ext-data.h5' ext_path = 'some/data' with h5py.File(mock_sa3_control_data, "a") as f: # add some data ds = f[xgm_intensity] ds[:] = np.ones(ds.shape, ds.dtype) ds = f[xgm_flux] ds[:] = np.ones(ds.shape, ds.dtype) # add softlink f["group/SOFTLINKED"] = h5py.SoftLink(f"/{xgm_intensity}") # add hardlink f['group/HARDLINKED'] = ds # add external link with h5py.File(Path(mock_sa3_control_data).parent / ext_file, 'w') as g: g[ext_path] = [1] f['group/EXTLINK'] = h5py.ExternalLink(ext_file, ext_path) copy_structure(mock_sa3_control_data, tmp_path, control_data=True) inp = h5py.File(mock_sa3_control_data) out = h5py.File(tmp_path / mock_sa3_control_data.rpartition("/")[-1]) slink = out.get("group/SOFTLINKED", getlink=True) extlink = out.get('group/EXTLINK', getlink=True) # softlinks are copied assert isinstance(slink, h5py.SoftLink) assert slink.path == f"/{xgm_intensity}" # hardlink assert out['group/HARDLINKED'] == out[xgm_flux] # external link assert extlink.filename == ext_file assert extlink.path == ext_path # data is not copied assert out[xgm_intensity].shape == inp[xgm_intensity].shape assert out[xgm_intensity].dtype == inp[xgm_intensity].dtype assert (out[xgm_intensity][()] == 0).all() # attributes are copied assert out[xgm_intensity].attrs["unitName"] == "joule" # control data is copied assert out[xgm_flux].shape == inp[xgm_flux].shape assert out[xgm_flux].dtype == inp[xgm_flux].dtype assert (out[xgm_flux][()] == 1).all() # run data is not copied assert out[f"RUN/{xgm}/classId/value"].dtype == h5py.string_dtype() assert out[f"RUN/{xgm}/classId/value"][()] == [b""] def test_copy_run(tmp_path, mock_spb_proc_run): copy_structure(mock_spb_proc_run, tmp_path) inp_files = list(Path(mock_spb_proc_run).glob('*.h5')) out_files = list(tmp_path.glob('*.h5')) assert len(inp_files) == len(out_files) def test_cli(tmp_path, mock_scs_run): # smoke test main([mock_scs_run, str(tmp_path)]) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_file_access.py0000644000175100001660000001251014757376472022465 0ustar00runnerdockerimport gc import pickle import cloudpickle import pytest import numpy as np from .. import RunDirectory from ..file_access import FileAccess from ..exceptions import FileStructureError def test_registry(mock_sa3_control_data): fa = FileAccess(mock_sa3_control_data) # Load some data to populate caches fa.get_index('SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput', 'data') fa.get_keys('SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput') assert len(fa._index_cache) == 1 assert len(fa._keys_cache) == 1 # This should get a reference to the existing object, not a duplicate. fa2 = FileAccess(mock_sa3_control_data) assert fa2 is fa # __init__() should not have been called again assert len(fa._index_cache) == 1 assert len(fa._keys_cache) == 1 @pytest.mark.parametrize('pickle_mod', (pickle, cloudpickle)) def test_pickle(pickle_mod, mock_sa3_control_data): fa = FileAccess(mock_sa3_control_data) b = pickle_mod.dumps(fa) # Load some data to populate caches fa.get_index('SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput', 'data') fa.get_keys('SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput') assert len(fa._index_cache) == 1 assert len(fa._keys_cache) == 1 # Unpickling should get a reference to the existing object, not a duplicate. fa2 = pickle_mod.loads(b) assert fa2 is fa # Unpickling should not update state of existing object assert len(fa._index_cache) == 1 assert len(fa._keys_cache) == 1 # Delete the existing instances, then reload from pickle del fa, fa2 gc.collect() fa3 = pickle_mod.loads(b) assert len(fa3._index_cache) == 0 assert len(fa3._keys_cache) == 0 assert 'SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput' in fa3.instrument_sources assert len(fa3.train_ids) == 500 # Empty FileAccess cache entry to test behaviour without actually trying # to read a non-existing file in tests below. _empty_cache_info = dict( train_ids= np.zeros(0, dtype=np.uint64), control_sources=frozenset(), instrument_sources=frozenset(), flag=np.zeros(0, dtype=np.int32), legacy_sources={}, ) def test_euxfel_path_infos(mock_sa3_control_data, monkeypatch): fa = FileAccess(mock_sa3_control_data) # Default path is not a EuXFEL storage location. assert fa.storage_class is None assert fa.instrument is None assert fa.cycle is None # EuXFEL locations are resolved to their true paths and may either # be on online GPFS, offline GPFS or dCache. for filename in [ '/gpfs/exfel/exp/SA3/202301/p001234/raw/r0100/foo.h5', '/gpfs/exfel/d/raw/SA3/202301/p001234/r0100/foo.h5', '/pnfs/xfel.eu/exfel/archive/XFEL/raw/SA3/202301/p001234/r0100/foo.h5' ]: fa = FileAccess(filename, _cache_info=_empty_cache_info) assert fa.storage_class == 'raw' assert fa.instrument == 'SA3' assert fa.cycle == '202301' def test_euxfel_filename_infos(mock_sa3_control_data, monkeypatch): fa = FileAccess(mock_sa3_control_data) assert fa.data_category == 'RAW' assert fa.aggregator == 'DA01' assert fa.sequence == 1 fa = FileAccess('/a/b/c/my-own-file.h5', _cache_info=_empty_cache_info) assert fa.data_category is None assert fa.aggregator is None assert fa.sequence is None def test_no_index(empty_h5_file): with pytest.raises(FileStructureError): FileAccess(empty_h5_file) def test_no_metadata(mock_no_metadata_file): with pytest.raises(FileStructureError): FileAccess(mock_no_metadata_file) @pytest.mark.parametrize( ['source', 'index_group'], [('SPB_XTD9_XGM/DOOCS/MAIN', None), ('SPB_XTD9_XGM/DOOCS/MAIN', ''), ('SPB_DET_AGIPD1M-1/DET/4CH0:xtdf', None), ('SPB_DET_AGIPD1M-1/DET/4CH0:xtdf', 'image')], ids=['control-None', 'control-empty', 'instrument-*', 'instrument-image']) def test_one_key(mock_spb_raw_run, source, index_group): # Get first file of the chosen source. fa = RunDirectory(mock_spb_raw_run)[source].files[0] # Collect valid key results, this may populate the cache. keys = [k for k in fa.get_keys(source) if index_group is None or k.startswith(index_group)] # Test with use of _keys_cache. assert fa.get_one_key(source, index_group) in keys # Force an empty cache. fa._keys_cache.clear() fa._known_keys.clear() # Test with direct file access. assert fa.get_one_key(source, index_group) in keys # Force an empty cache. fa._keys_cache.clear() fa._known_keys.clear() # Ask for a specific key to populate _known_keys. single_key = next(iter(keys)) assert fa.has_source_key(source, single_key) # Test with use of _known_keys. assert fa.get_one_key(source, index_group) == single_key def test_legacy_sources(mock_modern_spb_proc_run): # Get FileAccess for first module. fa = sorted(RunDirectory(mock_modern_spb_proc_run).files, key=lambda fa: fa.filename)[0] # There should be no control source. assert not fa.control_sources # Instrument sources should be a set of both canonical and legacy name. assert fa.instrument_sources == { 'SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'SPB_DET_AGIPD1M-1/CORR/0CH0:output'} # Legacy sources should be a dict mapping to the canonical name. assert fa.legacy_sources == { 'SPB_DET_AGIPD1M-1/DET/0CH0:xtdf': 'SPB_DET_AGIPD1M-1/CORR/0CH0:output'} ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_keydata.py0000644000175100001660000003165514757376472021662 0ustar00runnerdockerimport os import numpy as np import xarray as xr import pytest import h5py from extra_data import RunDirectory, H5File from extra_data.exceptions import TrainIDError, NoDataError from . import make_examples from .mockdata import write_file from .mockdata.xgm import XGM def test_get_keydata(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) print(run.instrument_sources) am0 = run['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.data'] assert len(am0.files) == 1 assert am0.section == 'INSTRUMENT' assert am0.is_instrument assert am0.entry_shape == (2, 512, 128) assert am0.ndim == 4 assert am0.dtype == np.dtype('u2') assert {p.name for p in am0.source_file_paths} == { 'RAW-R0238-AGIPD00-S00000.h5' } xgm_beam_x = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value'] assert len(xgm_beam_x.files) == 2 assert xgm_beam_x.section == 'CONTROL' assert xgm_beam_x.is_control assert xgm_beam_x.entry_shape == () assert xgm_beam_x.ndim == 1 assert xgm_beam_x.dtype == np.dtype('f4') assert {p.name for p in xgm_beam_x.source_file_paths} == { 'RAW-R0238-DA01-S00000.h5', 'RAW-R0238-DA01-S00001.h5' } data = xgm_beam_x.ndarray() assert xgm_beam_x.nbytes == data.nbytes # Ensure KeyData is not accidentally iterable with pytest.raises(TypeError): iter(xgm_beam_x) def test_select_trains(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm_beam_x = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value'] assert xgm_beam_x.shape == (64,) sel1 = xgm_beam_x[:20] # Equivalent to .select_trains(np.s_[:20]) assert sel1.shape == (20,) assert len(sel1.files) == 1 # Empty selection sel2 = xgm_beam_x[80:] assert sel2.shape == (0,) assert len(sel2.files) == 1 assert sel2.xarray().shape == (0,) # Single train sel3 = xgm_beam_x[32] assert sel3.shape == (1,) def test_split_trains(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm_beam_x = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value'] assert xgm_beam_x.shape == (64,) chunks = list(xgm_beam_x.split_trains(3)) assert len(chunks) == 3 assert {c.shape for c in chunks} == {(21,), (22,)} assert chunks[0].ndarray().shape == chunks[0].shape chunks = list(xgm_beam_x.split_trains(3, trains_per_part=20)) assert len(chunks) == 4 assert {c.shape for c in chunks} == {(16,)} def test_nodata(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) cam_pix = run['FXE_XAD_GEC/CAM/CAMERA_NODATA:daqOutput', 'data.image.pixels'] assert cam_pix.train_ids == list(range(10000, 10480)) assert len(cam_pix.files) == 2 assert cam_pix.shape == (0, 255, 1024) arr = cam_pix.xarray() assert arr.shape == (0, 255, 1024) assert arr.dtype == np.dtype('u2') dask_arr = cam_pix.dask_array(labelled=True) assert dask_arr.shape == (0, 255, 1024) assert dask_arr.dtype == np.dtype('u2') assert list(cam_pix.trains()) == [] tid, data = cam_pix.train_from_id(10010) assert tid == 10010 assert data.shape == (0, 255, 1024) def test_iter_trains(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm_beam_x = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value'] assert [t for (t, _) in xgm_beam_x.trains()] == list(range(10000, 10064)) for _, v in xgm_beam_x.trains(): assert isinstance(v, np.float32) break def test_iter_trains_keep_dims(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) jf_data = run['SPB_IRDA_JF4M/DET/JNGFR01:daqOutput', 'data.adc'] for _, v in jf_data.trains(keep_dims=True): assert v.shape == (1, 16, 512, 1024) def test_iter_trains_include_empty(mock_sa3_control_data): f = H5File(mock_sa3_control_data) beamview = f['SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput', 'data.image.dims'] for expected_tid, (data1_tid, data1), (data2_tid, data2) in zip( beamview.train_ids, beamview.trains(include_empty=True), beamview.trains(include_empty=True, keep_dims=True) ): assert expected_tid == data1_tid == data2_tid if (expected_tid % 2) == 0: assert data1 is None else: assert data1.shape == (2,) assert data2.shape == (expected_tid % 2, 2) def test_get_train(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm_beam_x = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value'] tid, val = xgm_beam_x.train_from_id(10005) assert tid == 10005 assert isinstance(val, np.float32) with pytest.raises(TrainIDError): xgm_beam_x.train_from_id(11000) tid, _ = xgm_beam_x.train_from_index(-10) assert tid == 10054 with pytest.raises(IndexError): xgm_beam_x.train_from_index(9999) def test_get_train_keep_dims(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) jf_adc = run['SPB_IRDA_JF4M/DET/JNGFR01:daqOutput', 'data.adc'] _, val = jf_adc.train_from_id(10005, keep_dims=True) assert val.shape == (1, 16, 512, 1024) _, val = jf_adc.train_from_index(-10, keep_dims=True) assert val.shape == (1, 16, 512, 1024) def test_data_counts(mock_reduced_spb_proc_run): run = RunDirectory(mock_reduced_spb_proc_run) # control data xgm_beam_x = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value'] count = xgm_beam_x.data_counts() assert count.index.tolist() == xgm_beam_x.train_ids assert (count.values == 1).all() # instrument data camera = run['SPB_IRU_CAM/CAM/SIDEMIC:daqOutput', 'data.image.pixels'] count = camera.data_counts() assert count.index.tolist() == camera.train_ids mod = run['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.data'] count = mod.data_counts() assert count.index.tolist() == mod.train_ids assert count.values.sum() == mod.shape[0] def test_data_counts_empty(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) cam_nodata = run['FXE_XAD_GEC/CAM/CAMERA_NODATA:daqOutput', 'data.image.pixels'] count_ser = cam_nodata.data_counts(labelled=True) assert len(count_ser) == 480 assert count_ser.sum() == 0 count_arr = cam_nodata.data_counts(labelled=False) assert len(count_arr) == 480 assert count_arr.sum() == 0 count_none_ser = cam_nodata.drop_empty_trains().data_counts(labelled=True) assert len(count_none_ser) == 0 count_none_arr = cam_nodata.drop_empty_trains().data_counts(labelled=False) assert len(count_none_arr) == 0 @pytest.fixture() def fxe_run_module_offset(tmp_path): run_dir = tmp_path / 'fxe-run-mod-offset' run_dir.mkdir() make_examples.make_fxe_run(run_dir, format_version='1.0') # Shift the train IDs for a module by 1, so it has data for a different set # of train IDs to other sources. with h5py.File(run_dir / 'RAW-R0450-LPD08-S00000.h5', 'r+') as f: tids_dset = f['INDEX/trainId'] tids_dset[:] = tids_dset[:] + 1 return run_dir def test_data_counts_missing_train(fxe_run_module_offset): run = RunDirectory(fxe_run_module_offset) assert len(run.train_ids) == 481 lpd_m8 = run['FXE_DET_LPD1M-1/DET/8CH0:xtdf', 'image.cellId'] ser = lpd_m8.data_counts(labelled=True) assert len(ser) == 480 np.testing.assert_array_equal(ser.index, run.train_ids[1:]) arr = lpd_m8.data_counts(labelled=False) assert len(arr) == 481 assert arr[0] == 0 np.testing.assert_array_equal(arr[1:], 128) lpd_m8_w_data = lpd_m8.drop_empty_trains() ser = lpd_m8_w_data.data_counts(labelled=True) assert len(ser) == 480 np.testing.assert_array_equal(ser.index, run.train_ids[1:]) arr = lpd_m8_w_data.data_counts(labelled=False) assert len(arr) == 480 np.testing.assert_array_equal(arr, 128) def test_select_by(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) am0 = run['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.data'] subrun = run.select(am0) assert subrun.all_sources == {am0.source} assert subrun.keys_for_source(am0.source) == {am0.key} def test_drop_empty_trains(mock_sa3_control_data): f = H5File(mock_sa3_control_data) beamview = f['SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput', 'data.image.dims'] assert len(beamview.train_ids) == 500 a1 = beamview.ndarray() assert a1.shape == (250, 2) frame_counts = beamview.data_counts(labelled=False) assert frame_counts.shape == (500,) assert frame_counts.min() == 0 beamview_w_data = beamview.drop_empty_trains() assert len(beamview_w_data.train_ids) == 250 np.testing.assert_array_equal(beamview_w_data.ndarray(), a1) frame_counts = beamview_w_data.data_counts(labelled=False) assert frame_counts.shape == (250,) assert frame_counts.min() == 1 def test_single_value(mock_sa3_control_data, monkeypatch): f = H5File(mock_sa3_control_data) imager = f['SA3_XTD10_IMGFEL/CAM/BEAMVIEW:daqOutput', 'data.image.pixels'] flux = f['SA3_XTD10_XGM/XGM/DOOCS', 'pulseEnergy.photonFlux'] # Try without data for a source and key. with pytest.raises(NoDataError): imager.as_single_value() # FEL imager with no data. with pytest.raises(NoDataError): flux[:0].as_single_value() # No data through selection. # Monkeypatch some actual data into the KeyData object data = np.arange(flux.shape[0]) monkeypatch.setattr(flux, 'ndarray', lambda: data) # Try some tolerances that have to fail. with pytest.raises(ValueError): flux.as_single_value() with pytest.raises(ValueError): flux.as_single_value(atol=1) with pytest.raises(ValueError): flux.as_single_value(rtol=0.1) # Try with large enough tolerances. assert flux.as_single_value(atol=len(data)/2) == np.median(data) assert flux.as_single_value(rtol=0.5, atol=len(data)/4) == np.median(data) assert flux.as_single_value(rtol=1) == np.median(data) # Other reduction options assert flux.as_single_value(rtol=1, reduce_by='mean') == np.mean(data) assert flux.as_single_value(rtol=1, reduce_by=np.mean) == np.mean(data) assert flux.as_single_value(atol=len(data)-1, reduce_by='first') == 0 # Try vector data. intensity = f['SA3_XTD10_XGM/XGM/DOOCS:output', 'data.intensityTD'] data = np.repeat(data, intensity.shape[1]).reshape(-1, intensity.shape[-1]) monkeypatch.setattr(intensity, 'ndarray', lambda: data) with pytest.raises(ValueError): intensity.as_single_value() np.testing.assert_equal(intensity.as_single_value(rtol=1), np.median(data)) def test_ndarray_out(mock_spb_raw_run): f = RunDirectory(mock_spb_raw_run) cam = f['SPB_IRU_CAM/CAM/SIDEMIC:daqOutput', 'data.image.dims'] buf_new = cam.ndarray() # New copy of data. buf_in = np.zeros(cam.shape, dtype=cam.dtype) buf_out = cam.ndarray(out=buf_in) # In-place copy of data. np.testing.assert_allclose(buf_new, buf_out) assert buf_in is buf_out def test_xarray_structured_data(mock_remi_run): run = RunDirectory(mock_remi_run) dset = run['SQS_REMI_DLD6/DET/TOP:output', 'rec.hits'].xarray() assert isinstance(dset, xr.Dataset) assert list(dset.data_vars.keys()) == ['x', 'y', 't', 'm'] arrs = list(dset.data_vars.values()) assert all([arr.shape == (100, 50) for arr in arrs]) assert all([arr.dtype == np.float64 for arr in arrs[:3]]) assert arrs[3].dtype == np.int32 np.testing.assert_equal(dset.coords['trainId'], np.arange(10000, 10100)) @pytest.fixture() def run_with_file_no_trains(mock_spb_raw_run): extra_file = os.path.join(mock_spb_raw_run, 'RAW-R0238-DA01-S00002.h5') write_file(extra_file, [ XGM('SPB_XTD9_XGM/DOOCS/MAIN'), ], ntrains=0) try: yield mock_spb_raw_run finally: os.unlink(extra_file) def test_file_no_trains(run_with_file_no_trains): run = RunDirectory(run_with_file_no_trains) xpos = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos'].ndarray() assert xpos.shape == (64,) def test_attributes(mock_sa3_control_data): run = H5File(mock_sa3_control_data) # INSTRUMENT key. xgm_intensity = run['SA3_XTD10_XGM/XGM/DOOCS:output', 'data.intensityTD'] attrs = xgm_intensity.attributes() assert isinstance(attrs, dict) assert attrs['metricPrefixName'] == 'micro' assert attrs['unitSymbol'] == 'J' # CONTROL key. xgm_beampos_x = run['SA3_XTD10_XGM/XGM/DOOCS', 'beamPosition.ixPos'] attrs = xgm_beampos_x.attributes() assert isinstance(attrs, dict) assert attrs['alias'] == 'IX.POS' assert attrs['description'] == 'Calculated X position [mm]' assert attrs['daqPolicy'][0] == -1 def test_units(mock_sa3_control_data): run = H5File(mock_sa3_control_data) xgm_intensity = run['SA3_XTD10_XGM/XGM/DOOCS:output', 'data.intensityTD'] assert xgm_intensity.units == 'μJ' assert xgm_intensity.units_name == 'microjoule' # Check that it still works after selecting 0 trains assert xgm_intensity.select_trains(np.s_[:0]).units == 'μJ' # units are added to xarray's attributes assert xgm_intensity.xarray().attrs['units'] == 'μJ' ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_lsxfel.py0000644000175100001660000000060714757376472021526 0ustar00runnerdockerfrom extra_data import lsxfel def test_lsxfel_file(mock_lpd_data, capsys): lsxfel.summarise_file(mock_lpd_data) out, err = capsys.readouterr() assert "480 trains, 1 source" in out def test_lsxfel_run(mock_fxe_raw_run, capsys): lsxfel.summarise_run(mock_fxe_raw_run) out, err = capsys.readouterr() assert "480 trains" in out assert "16 detector files" in out ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_open_file_limiter.py0000644000175100001660000000360414757376472023716 0ustar00runnerdockerimport gc import os import pytest from extra_data import file_access from extra_data.reader import DataCollection @pytest.fixture def files_limit_512(): orig_limiter = file_access.open_files_limiter file_access.open_files_limiter = l = file_access.OpenFilesLimiter(512) yield l file_access.open_files_limiter = orig_limiter @pytest.fixture def files_limit_3(): orig_limiter = file_access.open_files_limiter file_access.open_files_limiter = l = file_access.OpenFilesLimiter(3) yield l file_access.open_files_limiter = orig_limiter def test_filecache_large(mock_spb_raw_run, files_limit_512): fc = files_limit_512 files = [os.path.join(mock_spb_raw_run, f) \ for f in os.listdir(mock_spb_raw_run) if f.endswith('.h5')] run = DataCollection.from_paths(files) trains_iter = run.trains() tid, data = next(trains_iter) assert tid == 10000 device = 'SPB_IRU_CAM/CAM/SIDEMIC:daqOutput' assert device in data assert data[device]['data.image.pixels'].shape == (1024, 768) # 16 AGIPD files + 1st DA file, but the other sequence file may be opened assert fc.n_open_files() >= 17 del run, trains_iter gc.collect() assert fc.n_open_files() == 0 def test_filecache_small(mock_spb_raw_run, files_limit_3): fc = files_limit_3 files = [os.path.join(mock_spb_raw_run, f) \ for f in os.listdir(mock_spb_raw_run) if f.endswith('.h5')] run = DataCollection.from_paths(files) trains_iter = run.trains() for i in range(3): tid, data = next(trains_iter) assert tid == 10000 + i for j in range(16): device = f'SPB_DET_AGIPD1M-1/DET/{j}CH0:xtdf' assert device in data assert data[device]['image.data'].shape == (64, 2, 512, 128) assert len(fc._cache) == 3 del run, trains_iter gc.collect() assert fc.n_open_files() == 0 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_open_run.py0000644000175100001660000001434414757376472022061 0ustar00runnerdockerimport os import shutil from multiprocessing import Process from pathlib import Path from textwrap import dedent from unittest import mock from warnings import catch_warnings import numpy as np import pytest from extra_data import open_run from extra_data.reader import DEFAULT_ALIASES_FILE def test_open_run(mock_spb_raw_and_proc_run): mock_data_root, raw_run_dir, proc_run_dir = mock_spb_raw_and_proc_run with mock.patch('extra_data.read_machinery.DATA_ROOT_DIR', mock_data_root): # With integers run = open_run(proposal=2012, run=238) paths = {f.filename for f in run.files} assert paths for path in paths: assert '/raw/' in path # With strings run = open_run(proposal='2012', run='238') assert {f.filename for f in run.files} == paths # With numpy integers run = open_run(proposal=np.int64(2012), run=np.uint16(238)) assert {f.filename for f in run.files} == paths # Proc folder proc_run = open_run(proposal=2012, run=238, data='proc') proc_paths = {f.filename for f in proc_run.files} assert proc_paths for path in proc_paths: assert '/raw/' not in path # Helper function to write an alias file at a specific path def write_aliases(path): aliases_path.parent.mkdir(parents=True, exist_ok=True) aliases_path.write_text(dedent(""" xgm: SA1_XTD2_XGM/DOOCS/MAIN """)) # To set the aliases, we should be able to use a string relative to the # proposal directory. aliases_path = Path(mock_data_root) / "SPB/201830/p002012/foo.yml" write_aliases(aliases_path) run = open_run(2012, 238, data="all", aliases="{}/foo.yml") assert "xgm" in run.alias # And a proper path aliases_path = Path(mock_data_root) / "foo.yml" write_aliases(aliases_path) run = open_run(2012, 238, aliases=aliases_path) assert "xgm" in run.alias # And a plain string run = open_run(2012, 238, aliases=str(aliases_path)) assert "xgm" in run.alias # If the default file exists, it should be used automatically aliases_path = Path(DEFAULT_ALIASES_FILE.format(mock_data_root + "/SPB/201830/p002012")) write_aliases(aliases_path) run = open_run(2012, 238) assert "xgm" in run.alias # Check that aliases are loaded for old proposals where proc contains # all sources from raw too. Necessary because the aliases are only # loaded once for the raw data but the proc DataCollection will be used # if all sources exist in proc. shutil.rmtree(proc_run_dir) shutil.copytree(raw_run_dir, proc_run_dir) run = open_run(2012, 238, data="all") assert "xgm" in run.alias @pytest.mark.parametrize('location', ['all', ['raw', 'proc']], ids=['all', 'list']) def test_open_run_multiple(mock_spb_raw_and_proc_run, location): mock_data_root, raw_run_dir, proc_run_dir = mock_spb_raw_and_proc_run with mock.patch('extra_data.read_machinery.DATA_ROOT_DIR', mock_data_root): # Separate folders raw_run = open_run(proposal=2012, run=238, data='raw') proc_run = open_run(proposal=2012, run=238, data='proc') # All folders all_run = open_run(proposal=2012, run=238, data=location) # Raw contains all sources. assert raw_run.all_sources == all_run.all_sources # Proc is a true subset. assert proc_run.all_sources < all_run.all_sources for source, srcdata in all_run._sources_data.items(): for file in srcdata.files: if '/DET/' in source: # AGIPD data is in proc. assert '/raw/' not in file.filename else: # Non-AGIPD data is in raw. # (CAM, XGM) assert '/proc/' not in file.filename # Delete the proc data shutil.rmtree(proc_run_dir) assert not os.path.isdir(proc_run_dir) with catch_warnings(record=True) as w: # Opening a run with 'all', with no proc data all_run = open_run(proposal=2012, run=238, data=location) # Attempting to open the proc data should raise a warning assert len(w) == 1 # It should have opened at least the raw data assert raw_run.all_sources == all_run.all_sources # Run that doesn't exist with pytest.raises(Exception): open_run(proposal=2012, run=999) # run directory exists but contains no data os.makedirs(proc_run_dir) with catch_warnings(record=True) as w: open_run(proposal=2012, run=238, data=location) assert len(w) == 1 def test_open_run_default(mock_spb_raw_and_modern_proc_run): mock_data_root, raw_run_dir, proc_run_dir = mock_spb_raw_and_modern_proc_run with mock.patch('extra_data.read_machinery.DATA_ROOT_DIR', mock_data_root): run = open_run(proposal=2012, run=238, data='default') # /DET/ names should come from raw data det_sources = {f'SPB_DET_AGIPD1M-1/DET/{m}CH0:xtdf' for m in range(16)} for s in det_sources: assert 'image.gain' not in run[s] for file in run[s].files: assert '/raw/' in file.filename # /CORR/ names should come from corrected data corr_sources = {f'SPB_DET_AGIPD1M-1/CORR/{m}CH0:output' for m in range(16)} for s in corr_sources: assert 'image.gain' in run[s] for file in run[s].files: assert '/proc/' in file.filename assert run.legacy_sources == {} def open_run_daemonized_helper(mock_data_root): with mock.patch('extra_data.read_machinery.DATA_ROOT_DIR', mock_data_root): open_run(2012, 238, data="all", parallelize=False) def test_open_run_daemonized(mock_spb_raw_and_proc_run): mock_data_root, raw_run_dir, proc_run_dir = mock_spb_raw_and_proc_run # Daemon processes can't start their own children, check that opening a run is still possible. p = Process(target=open_run_daemonized_helper, args=(mock_data_root,), daemon=True) p.start() p.join() assert p.exitcode == 0 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_read_machinery.py0000644000175100001660000001106214757376472023200 0ustar00runnerdockerimport os import os.path as osp from unittest import mock import numpy as np import pytest from extra_data import RunDirectory, by_id, by_index, read_machinery from extra_data.read_machinery import select_train_ids def test_find_proposal(tmpdir): prop_dir = osp.join(str(tmpdir), 'SPB', '201701', 'p002012') os.makedirs(prop_dir) with mock.patch.object(read_machinery, 'DATA_ROOT_DIR', str(tmpdir)): assert read_machinery.find_proposal('p002012') == prop_dir assert read_machinery.find_proposal(prop_dir) == prop_dir def test_same_run(mock_spb_raw_run, mock_jungfrau_run, mock_scs_run): run_spb = RunDirectory(mock_spb_raw_run) run_jf = RunDirectory(mock_jungfrau_run) run_scs = RunDirectory(mock_scs_run) assert run_spb.is_single_run assert run_jf.is_single_run assert run_scs.is_single_run assert not read_machinery.same_run(run_spb, run_scs, run_jf) format_version = run_spb.files[0].format_version s1 = run_spb.select_trains(np.s_[:1]) s2 = run_spb.select_trains(np.s_[:-1]) s3 = run_spb.select_trains(np.s_[3]) s4 = run_spb.select('SA1_XTD2_XGM/DOOCS/MAIN', '*') s5 = run_spb.select('SPB_IRU_CAM/CAM/SIDEMIC:daqOutput', '*') if run_spb.run_metadata()['dataFormatVersion'] != '0.5': assert read_machinery.same_run(s1, s2, s3) assert read_machinery.same_run(s4, s5) else: assert not read_machinery.same_run(s1, s2, s3) assert not read_machinery.same_run(s4, s5) # SourceData sd = run_spb['SA1_XTD2_XGM/DOOCS/MAIN'] sd1 = sd.select_trains(np.s_[:1]) sd2 = sd.select_trains(np.s_[-1:]) assert sd.is_single_run if sd.run_metadata()['dataFormatVersion'] != '0.5': assert read_machinery.same_run(sd1, sd2) else: assert not read_machinery.same_run(sd1, sd2) def test_select_train_ids(): train_ids = list(range(1000000, 1000010)) # Test by_id with a single integer assert select_train_ids(train_ids, by_id[1000002]) == [1000002] # Test by_id with a numpy type assert select_train_ids(train_ids, by_id[np.uint64(1000002)]) == [1000002] # Test by_id with a slice assert select_train_ids(train_ids, by_id[1000002:1000005]) == [1000002, 1000003, 1000004] # Test by_id with a list assert select_train_ids(train_ids, by_id[[1000002, 1000005]]) == [1000002, 1000005] # Test by_id with a numpy array assert select_train_ids(train_ids, by_id[np.array([1000002, 1000005])]) == [1000002, 1000005] # Test by_id with a slice and step assert select_train_ids(train_ids, by_id[1000000:1000008:2]) == [1000000, 1000002, 1000004, 1000006] # Test by_id with an open-ended slice (end) assert select_train_ids(train_ids, by_id[1000005:]) == [1000005, 1000006, 1000007, 1000008, 1000009] # Test by_id with an open-ended slice (start) assert select_train_ids(train_ids, by_id[:1000003]) == [1000000, 1000001, 1000002] # Test by_index with a single integer assert select_train_ids(train_ids, by_index[2]) == [1000002] # Test by_index with a 0D numpy array assert select_train_ids(train_ids, by_index[np.array(5, dtype=np.int8)]) == [1000005] # Test by_index with a slice assert select_train_ids(train_ids, by_index[1:4]) == [1000001, 1000002, 1000003] # Test by_index with a list assert select_train_ids(train_ids, by_index[[1, 3]]) == [1000001, 1000003] # Test by_index with a slice and step assert select_train_ids(train_ids, by_index[::2]) == [1000000, 1000002, 1000004, 1000006, 1000008] # Test with a plain integer assert select_train_ids(train_ids, 3) == [1000003] # Test with a plain int-like assert select_train_ids(train_ids, np.uint32(3)) == [1000003] # Test with a plain slice assert select_train_ids(train_ids, slice(1, 4)) == [1000001, 1000002, 1000003] # Test with a plain list assert select_train_ids(train_ids, [1, 3]) == [1000001, 1000003] # Test with a numpy array assert select_train_ids(train_ids, np.array([1, 3])) == [1000001, 1000003] # Test with an invalid type (should raise TypeError) with pytest.raises(TypeError): select_train_ids(train_ids, "invalid") with pytest.raises(TypeError): select_train_ids(train_ids, by_id[np.float64(1000006)]) # Test by_id with train IDs not found in the list (should raise a warning) with pytest.warns(UserWarning): result = select_train_ids(train_ids, by_id[[999999, 1000010]]) assert result == [] with pytest.warns(UserWarning): result = select_train_ids(train_ids, by_id[1000010]) assert result == [] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_reader_mockdata.py0000644000175100001660000011437014757376472023341 0ustar00runnerdockerfrom datetime import datetime, timedelta, timezone from itertools import islice from warnings import catch_warnings import h5py import numpy as np import os import pandas as pd import pytest import stat from tempfile import mkdtemp from testpath import assert_isfile from xarray import DataArray from extra_data import ( H5File, RunDirectory, by_index, by_id, SourceNameError, PropertyNameError, DataCollection, MultiRunError ) def test_iterate_trains(mock_agipd_data, mock_control_data_with_empty_source): with H5File(mock_agipd_data) as f: for train_id, data in islice(f.trains(), 10): assert train_id in range(10000, 10250) assert 'SPB_DET_AGIPD1M-1/DET/7CH0:xtdf' in data assert len(data) == 1 assert 'image.data' in data['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf'] with H5File(mock_control_data_with_empty_source) as f: # smoke test tid, data = next(f.trains()) assert list(data['SA3_XTD10_VAC/GAUGE/G30520C'].keys()) == ['metadata'] def test_iterate_trains_flat_keys(mock_agipd_data): with H5File(mock_agipd_data) as f: for train_id, data in islice(f.trains(flat_keys=True), 10): assert train_id in range(10000, 10250) assert ('SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.data') in data def test_iterate_trains_keep_dims(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) for _, data in islice(run.select( '*JF4M/DET/*', 'data.adc' ).trains(keep_dims=True), 10): assert data[ 'SPB_IRDA_JF4M/DET/JNGFR01:daqOutput']['data.adc'].shape == ( 1, 16, 512, 1024) def test_get_train_keep_dims(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) _, data = run.select( '*JF4M/DET/*', 'data.adc').train_from_index(0, keep_dims=True) assert data[ 'SPB_IRDA_JF4M/DET/JNGFR01:daqOutput']["data.adc"].shape == ( 1, 16, 512, 1024) def test_get_train_bad_device_name(mock_spb_control_data_badname): # Check that we can handle devices which don't have the standard Karabo # name structure A/B/C. with H5File(mock_spb_control_data_badname) as f: train_id, data = f.train_from_id(10004) assert train_id == 10004 device = 'SPB_IRU_SIDEMIC_CAM:daqOutput' assert device in data assert 'data.image.dims' in data[device] dims = data[device]['data.image.dims'] assert list(dims) == [1000, 1000] def test_detector_info_oldfmt(mock_agipd_data): with H5File(mock_agipd_data) as f: di = f.detector_info('SPB_DET_AGIPD1M-1/DET/7CH0:xtdf') assert di['dims'] == (512, 128) assert di['frames_per_train'] == 64 assert di['total_frames'] == 16000 def test_detector_info(mock_lpd_data): with H5File(mock_lpd_data) as f: di = f.detector_info('FXE_DET_LPD1M-1/DET/0CH0:xtdf') assert di['dims'] == (256, 256) assert di['frames_per_train'] == 128 assert di['total_frames'] == 128 * 480 def test_train_info(mock_lpd_data, capsys): with H5File(mock_lpd_data) as f: f.train_info(10004) out, err = capsys.readouterr() assert "Devices" in out assert "FXE_DET_LPD1M-1/DET/0CH0:xtdf" in out def test_info(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) run.info(details_for_sources='*/DOOCS/*') # Smoketest def test_iterate_trains_fxe(mock_fxe_control_data): with H5File(mock_fxe_control_data) as f: for train_id, data in islice(f.trains(), 10): assert train_id in range(10000, 10400) assert 'SA1_XTD2_XGM/DOOCS/MAIN' in data.keys() assert 'beamPosition.ixPos.value' in data['SA1_XTD2_XGM/DOOCS/MAIN'] assert 'data.image.pixels' in data['FXE_XAD_GEC/CAM/CAMERA:daqOutput'] assert 'data.image.pixels' not in data['FXE_XAD_GEC/CAM/CAMERA_NODATA:daqOutput'] def test_iterate_file_select_trains(mock_fxe_control_data): with H5File(mock_fxe_control_data) as f: tids = [tid for (tid, _) in f.trains(train_range=by_id[:10003])] assert tids == [10000, 10001, 10002] tids = [tid for (tid, _) in f.trains(train_range=by_index[-2:])] assert tids == [10398, 10399] def test_iterate_trains_select_keys(mock_fxe_control_data): sel = { 'SA1_XTD2_XGM/DOOCS/MAIN': { 'beamPosition.ixPos.value', 'beamPosition.ixPos.timestamp', } } with H5File(mock_fxe_control_data) as f: for train_id, data in islice(f.trains(devices=sel), 10): assert train_id in range(10000, 10400) assert 'SA1_XTD2_XGM/DOOCS/MAIN' in data.keys() assert 'beamPosition.ixPos.value' in data['SA1_XTD2_XGM/DOOCS/MAIN'] assert 'beamPosition.ixPos.timestamp' in data['SA1_XTD2_XGM/DOOCS/MAIN'] assert 'beamPosition.iyPos.value' not in data['SA1_XTD2_XGM/DOOCS/MAIN'] assert 'SA3_XTD10_VAC/TSENS/S30160K' not in data def test_iterate_trains_require_all(mock_sa3_control_data): with H5File(mock_sa3_control_data) as f: trains_iter = f.trains( devices=[('*/CAM/BEAMVIEW:daqOutput', 'data.image.dims')], require_all=True ) tids = [t for (t, _) in trains_iter] assert tids == [] trains_iter = f.trains( devices=[('*/CAM/BEAMVIEW:daqOutput', 'data.image.dims')], require_all=False ) tids = [t for (t, _) in trains_iter] assert tids != [] def test_read_fxe_raw_run(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) assert len(run.files) == 18 # 16 LPD 1M + 2 control data files assert run.train_ids == list(range(10000, 10480)) run.info() # Smoke test def test_read_fxe_raw_run_selective(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run, include='*DA*') assert run.train_ids == list(range(10000, 10480)) assert 'SA1_XTD2_XGM/DOOCS/MAIN' in run.control_sources assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' not in run.detector_sources run = RunDirectory(mock_fxe_raw_run, include='*LPD*') assert run.train_ids == list(range(10000, 10480)) assert 'SA1_XTD2_XGM/DOOCS/MAIN' not in run.control_sources assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' in run.detector_sources run = RunDirectory(mock_fxe_raw_run, file_filter=lambda x: [f for f in x if "LPD" in f]) assert run.train_ids == list(range(10000, 10480)) assert 'SA1_XTD2_XGM/DOOCS/MAIN' not in run.control_sources assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' in run.detector_sources def test_read_spb_proc_run(mock_spb_proc_run): run = RunDirectory(mock_spb_proc_run) #Test for calib data assert len(run.files) == 16 # only 16 detector modules for calib data assert run.train_ids == list(range(10000, 10064)) #64 trains tid, data = next(run.trains()) device = 'SPB_DET_AGIPD1M-1/DET/15CH0:xtdf' assert tid == 10000 for prop in ('image.gain', 'image.mask', 'image.data'): assert prop in data[device] assert 'u1' == data[device]['image.gain'].dtype assert 'u4' == data[device]['image.mask'].dtype assert 'f4' == data[device]['image.data'].dtype run.info() # Smoke test run.plot_missing_data() # Smoke test def test_iterate_spb_raw_run(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) trains_iter = run.trains() tid, data = next(trains_iter) assert tid == 10000 device = 'SPB_IRU_CAM/CAM/SIDEMIC:daqOutput' assert device in data assert data[device]['data.image.pixels'].shape == (1024, 768) def test_iterate_spb_raw_run_keep_dims(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) trains_iter = run.select( 'SPB_IRU_CAM/CAM/SIDEMIC:daqOutput', 'data.image.pixels').trains(keep_dims=True) _, data = next(trains_iter) assert data[ 'SPB_IRU_CAM/CAM/SIDEMIC:daqOutput']['data.image.pixels' ].shape == (1, 1024, 768) def test_properties_fxe_raw_run(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) assert run.train_ids == list(range(10000, 10480)) assert 'SPB_XTD9_XGM/DOOCS/MAIN' in run.control_sources assert 'FXE_DET_LPD1M-1/DET/15CH0:xtdf' in run.instrument_sources def test_iterate_fxe_run(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) trains_iter = run.trains() tid, data = next(trains_iter) assert tid == 10000 assert 'FXE_DET_LPD1M-1/DET/15CH0:xtdf' in data assert 'image.data' in data['FXE_DET_LPD1M-1/DET/15CH0:xtdf'] assert 'FXE_XAD_GEC/CAM/CAMERA' in data assert 'firmwareVersion.value' in data['FXE_XAD_GEC/CAM/CAMERA'] def test_iterate_select_trains(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) tids = [tid for (tid, _) in run.trains(train_range=by_id[10004:10006])] assert tids == [10004, 10005] tids = [tid for (tid, _) in run.trains(train_range=by_id[:10003])] assert tids == [10000, 10001, 10002] # Overlap with start of run tids = [tid for (tid, _) in run.trains(train_range=by_id[9000:10003])] assert tids == [10000, 10001, 10002] # Overlap with end of run tids = [tid for (tid, _) in run.trains(train_range=by_id[10478:10500])] assert tids == [10478, 10479] # Not overlapping with catch_warnings(record=True) as w: tids = [tid for (tid, _) in run.trains(train_range=by_id[9000:9050])] assert tids == [] assert 'before' in str(w[0].message) with catch_warnings(record=True) as w: tids = [tid for (tid, _) in run.trains(train_range=by_id[10500:10550])] assert tids == [] assert 'after' in str(w[0].message) tids = [tid for (tid, _) in run.trains(train_range=by_index[4:6])] assert tids == [10004, 10005] def test_iterate_run_glob_devices(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) trains_iter = run.trains([("*/DET/*", "image.data")]) tid, data = next(trains_iter) assert tid == 10000 assert 'FXE_DET_LPD1M-1/DET/15CH0:xtdf' in data assert 'image.data' in data['FXE_DET_LPD1M-1/DET/15CH0:xtdf'] assert 'detector.data' not in data['FXE_DET_LPD1M-1/DET/15CH0:xtdf'] assert 'FXE_XAD_GEC/CAM/CAMERA' not in data def test_train_by_id(mock_fxe_raw_run, mock_control_data_with_empty_source): # full run run = RunDirectory(mock_fxe_raw_run) _, data = run.train_from_id(10024) assert 'FXE_DET_LPD1M-1/DET/15CH0:xtdf' in data assert 'image.data' in data['FXE_DET_LPD1M-1/DET/15CH0:xtdf'] assert 'FXE_XAD_GEC/CAM/CAMERA' in data assert 'firmwareVersion.value' in data['FXE_XAD_GEC/CAM/CAMERA'] # selection run = RunDirectory(mock_fxe_raw_run) _, data = run.train_from_id(10024, [('*/DET/*', 'image.data')]) assert 'FXE_DET_LPD1M-1/DET/15CH0:xtdf' in data assert 'image.data' in data['FXE_DET_LPD1M-1/DET/15CH0:xtdf'] assert 'FXE_XAD_GEC/CAM/CAMERA' not in data # missing control data with H5File(mock_control_data_with_empty_source) as f: _, data = f.train_from_id(10000) assert 'SA3_XTD10_VAC/GAUGE/G30520C' in data assert ['metadata'] == list(data['SA3_XTD10_VAC/GAUGE/G30520C'].keys()) def test_train_from_index_fxe_run(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) _, data = run.train_from_index(479) assert 'FXE_DET_LPD1M-1/DET/15CH0:xtdf' in data assert 'image.data' in data['FXE_DET_LPD1M-1/DET/15CH0:xtdf'] assert 'FXE_XAD_GEC/CAM/CAMERA' in data assert 'firmwareVersion.value' in data['FXE_XAD_GEC/CAM/CAMERA'] def test_file_get_series_control(mock_fxe_control_data): with H5File(mock_fxe_control_data) as f: s = f.get_series('SA1_XTD2_XGM/DOOCS/MAIN', "beamPosition.iyPos.value") assert isinstance(s, pd.Series) assert len(s) == 400 assert s.index[0] == 10000 def test_file_get_series_instrument(mock_spb_proc_run): agipd_file = os.path.join(mock_spb_proc_run, 'CORR-R0238-AGIPD07-S00000.h5') with H5File(agipd_file) as f: s = f.get_series('SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'header.linkId') assert isinstance(s, pd.Series) assert len(s) == 64 assert s.index[0] == 10000 # Multiple readings per train s2 = f.get_series('SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId') assert isinstance(s2, pd.Series) assert not s2.index.is_unique assert len(s2) == 64 * 64 assert len(s2.loc[10000:10004]) == 5 * 64 sel = f.select_trains(by_index[5:10]) s3 = sel.get_series('SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId') assert isinstance(s3, pd.Series) assert not s3.index.is_unique assert len(s3) == 5 * 64 np.testing.assert_array_equal( s3.index.values, np.arange(10005, 10010).repeat(64) ) def test_run_get_series_control(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) s = run.get_series('SA1_XTD2_XGM/DOOCS/MAIN', "beamPosition.iyPos.value") assert isinstance(s, pd.Series) assert len(s) == 480 assert list(s.index) == list(range(10000, 10480)) def test_run_get_series_select_trains(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) sel = run.select_trains(by_id[10100:10150]) s = sel.get_series('SA1_XTD2_XGM/DOOCS/MAIN', "beamPosition.iyPos.value") assert isinstance(s, pd.Series) assert len(s) == 50 assert list(s.index) == list(range(10100, 10150)) def test_run_get_dataframe(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) df = run.get_dataframe(fields=[("*_XGM/*", "*.i[xy]Pos*")]) assert len(df.columns) == 4 assert "SA1_XTD2_XGM/DOOCS/MAIN/beamPosition.ixPos" in df.columns df2 = run.get_dataframe(fields=[("*_XGM/*", "*.i[xy]Pos*")], timestamps=True) assert len(df2.columns) == 8 assert "SA1_XTD2_XGM/DOOCS/MAIN/beamPosition.ixPos" in df2.columns assert "SA1_XTD2_XGM/DOOCS/MAIN/beamPosition.ixPos.timestamp" in df2.columns def test_file_get_array(mock_fxe_control_data): with H5File(mock_fxe_control_data) as f: arr = f.get_array('FXE_XAD_GEC/CAM/CAMERA:daqOutput', 'data.image.pixels') assert isinstance(arr, DataArray) assert arr.dims == ('trainId', 'dim_0', 'dim_1') assert arr.shape == (400, 255, 1024) assert arr.coords['trainId'][0] == 10000 def test_file_get_array_missing_trains(mock_sa3_control_data): with H5File(mock_sa3_control_data) as f: sel = f.select_trains(by_index[:6]) arr = sel.get_array( 'SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput', 'data.image.dims' ) assert isinstance(arr, DataArray) assert arr.dims == ('trainId', 'dim_0') assert arr.shape == (3, 2) np.testing.assert_array_less(arr.coords['trainId'], 10006) np.testing.assert_array_less(10000, arr.coords['trainId']) def test_file_get_array_control_roi(mock_sa3_control_data): with H5File(mock_sa3_control_data) as f: sel = f.select_trains(by_index[:6]) arr = sel.get_array( 'SA3_XTD10_VAC/DCTRL/D6_APERT_IN_OK', 'interlock.a1.AActCommand.value', roi=by_index[:25], ) assert isinstance(arr, DataArray) assert arr.shape == (6, 25) assert arr.coords['trainId'][0] == 10000 @pytest.mark.parametrize('name_in, name_out', [ (None, 'SA1_XTD2_XGM/DOOCS/MAIN:output.data.intensityTD'), ('SA1_XGM', 'SA1_XGM') ], ids=['defaultName', 'explicitName']) def test_run_get_array(mock_fxe_raw_run, name_in, name_out): run = RunDirectory(mock_fxe_raw_run) arr = run.get_array( 'SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD', extra_dims=['pulse'], name=name_in ) assert isinstance(arr, DataArray) assert arr.dims == ('trainId', 'pulse') assert arr.shape == (480, 1000) assert arr.coords['trainId'][0] == 10000 assert arr.name == name_out def test_run_get_array_empty(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) arr = run.get_array('FXE_XAD_GEC/CAM/CAMERA_NODATA:daqOutput', 'data.image.pixels') assert isinstance(arr, DataArray) assert arr.dims[0] == 'trainId' assert arr.shape == (0, 255, 1024) def test_run_get_array_error(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) with pytest.raises(SourceNameError): run.get_array('bad_name', 'data.intensityTD') with pytest.raises(PropertyNameError): run.get_array('SA1_XTD2_XGM/DOOCS/MAIN:output', 'bad_name') def test_run_get_array_select_trains(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) sel = run.select_trains(by_id[10100:10150]) arr = sel.get_array( 'SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD', extra_dims=['pulse'] ) assert isinstance(arr, DataArray) assert arr.dims == ('trainId', 'pulse') assert arr.shape == (50, 1000) assert arr.coords['trainId'][0] == 10100 def test_run_get_array_roi(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) arr = run.get_array('SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD', extra_dims=['pulse'], roi=by_index[:16]) assert isinstance(arr, DataArray) assert arr.dims == ('trainId', 'pulse') assert arr.shape == (480, 16) assert arr.coords['trainId'][0] == 10000 def test_run_get_array_multiple_per_train(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) sel = run.select_trains(np.s_[:2]) arr = sel.get_array( 'FXE_DET_LPD1M-1/DET/6CH0:xtdf', 'image.data', roi=np.s_[:, 10:20, 20:40] ) assert isinstance(arr, DataArray) assert arr.shape == (256, 1, 10, 20) np.testing.assert_array_equal(arr.coords['trainId'], np.repeat([10000, 10001], 128)) def test_run_get_virtual_dataset(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) ds = run.get_virtual_dataset('FXE_DET_LPD1M-1/DET/6CH0:xtdf', 'image.data') assert isinstance(ds, h5py.Dataset) assert ds.is_virtual assert ds.shape == (61440, 1, 256, 256) # Across two sequence files ds = run.get_virtual_dataset( 'FXE_XAD_GEC/CAM/CAMERA:daqOutput', 'data.image.pixels' ) assert isinstance(ds, h5py.Dataset) assert ds.is_virtual assert ds.shape == (480, 255, 1024) def test_run_get_virtual_dataset_filename(mock_fxe_raw_run, tmpdir): run = RunDirectory(mock_fxe_raw_run) path = str(tmpdir / 'test-vds.h5') ds = run.get_virtual_dataset( 'FXE_DET_LPD1M-1/DET/6CH0:xtdf', 'image.data', filename=path ) assert_isfile(path) assert ds.file.filename == path assert isinstance(ds, h5py.Dataset) assert ds.is_virtual assert ds.shape == (61440, 1, 256, 256) def test_run_get_dask_array(mock_fxe_raw_run): import dask.array as da run = RunDirectory(mock_fxe_raw_run) arr = run.get_dask_array( 'SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD', ) assert isinstance(arr, da.Array) assert arr.shape == (480, 1000) assert arr.dtype == np.float32 def test_run_get_dask_array_labelled(mock_fxe_raw_run): import dask.array as da run = RunDirectory(mock_fxe_raw_run) arr = run.get_dask_array( 'SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD', labelled=True ) assert isinstance(arr, DataArray) assert isinstance(arr.data, da.Array) assert arr.dims == ('trainId', 'dim_0') assert arr.shape == (480, 1000) assert arr.coords['trainId'][0] == 10000 def test_select(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) assert 'SPB_XTD9_XGM/DOOCS/MAIN' in run.control_sources # Basic selection machinery, glob API sel = run.select('*/DET/*', 'image.pulseId') assert 'SPB_XTD9_XGM/DOOCS/MAIN' not in sel.control_sources assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' in sel.instrument_sources _, data = sel.train_from_id(10000) for source, source_data in data.items(): assert set(source_data.keys()) == {'image.pulseId', 'metadata'} sel_by_list = run.select([ ('*/DET/*', 'image.pulseId'), 'FXE_XAD_GEC/CAM/*', ]) assert 'SPB_XTD9_XGM/DOOCS/MAIN' not in sel_by_list.control_sources assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' in sel_by_list.instrument_sources assert sel_by_list['FXE_DET_LPD1M-1/DET/0CH0:xtdf'].keys() == {'image.pulseId'} cam_src = 'FXE_XAD_GEC/CAM/CAMERA_NODATA' assert cam_src in sel_by_list.control_sources assert f'{cam_src}:daqOutput' in sel_by_list.instrument_sources assert sel_by_list[cam_src].keys() == run[cam_src].keys() assert sel_by_list[f'{cam_src}:daqOutput'].keys() == run[f'{cam_src}:daqOutput'].keys() # Basic selection machinery, dict-based API sel_by_dict = run.select({ 'SA1_XTD2_XGM/DOOCS/MAIN': None, 'FXE_DET_LPD1M-1/DET/0CH0:xtdf': {'image.pulseId'} }) assert sel_by_dict.control_sources == {'SA1_XTD2_XGM/DOOCS/MAIN'} assert sel_by_dict.instrument_sources == {'FXE_DET_LPD1M-1/DET/0CH0:xtdf'} assert sel_by_dict.keys_for_source('FXE_DET_LPD1M-1/DET/0CH0:xtdf') == \ sel.keys_for_source('FXE_DET_LPD1M-1/DET/0CH0:xtdf') # Re-select using * selection, should yield the same keys. assert sel.keys_for_source('FXE_DET_LPD1M-1/DET/0CH0:xtdf') == \ sel.select('FXE_DET_LPD1M-1/DET/0CH0:xtdf', '*') \ .keys_for_source('FXE_DET_LPD1M-1/DET/0CH0:xtdf') assert sel.keys_for_source('FXE_DET_LPD1M-1/DET/0CH0:xtdf') == \ sel.select({'FXE_DET_LPD1M-1/DET/0CH0:xtdf': {}}) \ .keys_for_source('FXE_DET_LPD1M-1/DET/0CH0:xtdf') # Re-select a different but originally valid key, should fail. with pytest.raises(ValueError): # ValueError due to globbing. sel.select('FXE_DET_LPD1M-1/DET/0CH0:xtdf', 'image.trainId') with pytest.raises(PropertyNameError): # PropertyNameError via explicit key. sel.select({'FXE_DET_LPD1M-1/DET/0CH0:xtdf': {'image.trainId'}}) # Select by another DataCollection. sel_by_dc = run.select(sel) assert sel_by_dc.control_sources == sel.control_sources assert sel_by_dc.instrument_sources == sel.instrument_sources assert sel_by_dc.train_ids == sel.train_ids # Select by SourceData. sd = run['SPB_XTD9_XGM/DOOCS/MAIN'].select_keys('beamPosition.*') sel_by_sd = run.select(sd) assert sel_by_sd.control_sources == {sd.source} assert sel_by_sd.keys_for_source(sd.source) == sd.keys() # Select by KeyData. kd = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos'] sel_by_kd = run.select(kd) assert sel_by_kd.control_sources == {kd.source} assert sel_by_kd.keys_for_source(kd.source) == {kd.key} # disallow mixing source and train ID selection with pytest.raises(TypeError): run['SPB_XTD9_XGM/DOOCS/MAIN', 10] @pytest.mark.parametrize( 'select_str', ['*/BEAMVIEW2:daqOutput', '*/BEAMVIEW2*', '*', [('*/BEAMVIEW2:*', 'data.image.*')]] ) def test_select_require_all(mock_sa3_control_data, select_str): # De-select two sources in this example set which have no trains # at all as well as one other with partuial trains, to allow # matching trains across all sources with the same result. run = H5File(mock_sa3_control_data) \ .deselect([('SA3_XTD10_MCP/ADC/1:*', '*'), ('SA3_XTD10_IMGFEL/CAM/BEAMVIEW:*', '*'), ('SA3_XTD10_IMGFEL/CAM/BEAMVIEW3:*', '*')]) subrun = run.select(select_str, require_all=True) np.testing.assert_array_equal(subrun.train_ids, run.train_ids[1::2]) # The train IDs are held by ndarrays during this operation, make # sure it's a list of np.uint64 again. assert isinstance(subrun.train_ids, list) assert all([isinstance(x, np.uint64) for x in subrun.train_ids]) def test_select_require_all_empty(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) with pytest.warns(match=r"(\d+)/\1 \(100%\) trains dropped"): sel = run.select([ "*_XGM/DOOCS/MAIN:output", "FXE_XAD_GEC/CAM/CAMERA_NODATA:daqOutput" ], require_all=True) assert sel.train_ids == [] def test_select_require_any(mock_sa3_control_data): run = H5File(mock_sa3_control_data) # BEAMVIEW2 has 250/500 trains, BEAMVIEW3 has 200/500 trains. # Compare the train IDs resulting from a require-any select with the # union of their respective train IDs. np.testing.assert_array_equal( run.select('*/BEAMVIEW*:daqOutput', require_any=True).train_ids, np.union1d( run.select('*/BEAMVIEW2:daqOutput', require_all=True).train_ids, run.select('*/BEAMVIEW3:daqOutput', require_all=True).train_ids )) # BEAMVIEW has no trains, should also yield an empty list. assert run.select('*/BEAMVIEW:daqOutput', require_any=True).train_ids == [] def test_deselect(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) xtd9_xgm = 'SPB_XTD9_XGM/DOOCS/MAIN' assert xtd9_xgm in run.control_sources sel = run.deselect('*_XGM/DOOCS*') assert xtd9_xgm not in sel.control_sources assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' in sel.instrument_sources sel = run.deselect('*_XGM/DOOCS*', '*.ixPos') assert xtd9_xgm in sel.control_sources assert 'beamPosition.ixPos.value' not in sel.selection[xtd9_xgm] assert 'beamPosition.iyPos.value' in sel.selection[xtd9_xgm] sel = run.deselect(run.select('*_XGM/DOOCS*')) assert xtd9_xgm not in sel.control_sources assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' in sel.instrument_sources def test_select_trains(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) assert len(run.train_ids) == 480 sel = run.select_trains(by_id[10200:10220]) assert sel.train_ids == list(range(10200, 10220)) sel = run.select_trains(by_index[:10]) assert sel.train_ids == list(range(10000, 10010)) with catch_warnings(record=True) as w: sel = run.select_trains(by_id[9000:9100]) # Before data assert sel.train_ids == [] assert len(w) == 1 assert "before" in str(w[0].message) with catch_warnings(record=True) as w: sel = run.select_trains(by_id[12000:12500]) # After data assert sel.train_ids == [] assert len(w) == 1 assert "after" in str(w[0].message) # Select a list of train IDs sel = run.select_trains(by_id[[9950, 10000, 10101, 10500]]) assert sel.train_ids == [10000, 10101] with catch_warnings(record=True) as w: sel = run.select_trains(by_id[[9900, 10600]]) assert sel.train_ids == [] assert len(w) == 1 assert "not found" in str(w[0].message) # Select a list of indexes sel = run.select_trains(by_index[[5, 25]]) assert sel.train_ids == [10005, 10025] with pytest.raises(IndexError): run.select_trains(by_index[[480]]) assert run[10].train_ids == [10010] assert run[by_id[10000]].train_ids == [10000] assert run[by_index[479:555]].train_ids == [10479] with pytest.raises(IndexError): run[555] def test_split_trains(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) assert len(run.train_ids) == 480 chunks = list(run.split_trains(3)) assert len(chunks) == 3 assert {len(c.train_ids) for c in chunks} == {160} arr = chunks[0]['FXE_XAD_GEC/CAM/CAMERA:daqOutput', 'data.image.dims'].ndarray() assert arr.shape == (160, 2) chunks = list(run.split_trains(4, trains_per_part=100)) assert len(chunks) == 5 assert {len(c.train_ids) for c in chunks} == {96} def test_train_timestamps(mock_scs_run): run = RunDirectory(mock_scs_run) tss = run.train_timestamps(labelled=False) assert isinstance(tss, np.ndarray) assert tss.shape == (len(run.train_ids),) assert tss.dtype == np.dtype('datetime64[ns]') assert np.all(np.diff(tss).astype(np.uint64) > 0) # Convert numpy datetime64[ns] to Python datetime (dropping some precision) tss_l = run.train_timestamps(pydatetime=True) assert len(tss_l) == len(run.train_ids) now = datetime.now(timezone.utc) assert tss_l[0] > (now - timedelta(days=1)) # assuming tests take < 1 day to run assert tss_l[0] < now assert tss_l[0].tzinfo is timezone.utc tss_ser = run.train_timestamps(labelled=True) assert isinstance(tss_ser, pd.Series) np.testing.assert_array_equal(tss_ser.values, tss) np.testing.assert_array_equal(tss_ser.index, run.train_ids) assert tss_ser.dt.tz is timezone.utc def test_train_timestamps_local_time(mock_scs_run): run = RunDirectory(mock_scs_run) del1h = timedelta(hours=1) del2h = timedelta(hours=2) # First, the pydatetime case tss_berlin = run.train_timestamps(pydatetime=True, euxfel_local_time=True) # The time difference between UTC and Europe/Berlin can only be # one or two hours depending on daylight savings assert all( t1.utcoffset() == del1h or t1.utcoffset() == del2h for t1 in tss_berlin ) # Second, the pandas (labelled=True) case tss = run.train_timestamps(labelled=True) tss_berlin = run.train_timestamps(labelled=True, euxfel_local_time=True) dtss = tss_berlin.dt.tz_localize(None) - tss.dt.tz_localize(None) assert all(dtss == del1h) or all(dtss == del2h) # Finally, check that ValueError is raised if euxfel_local_time is used # on its own with pytest.raises(ValueError): run.train_timestamps(pydatetime=False, labelled=False, euxfel_local_time=True) def test_train_timestamps_nat(mock_fxe_control_data): f = H5File(mock_fxe_control_data) tss = f.train_timestamps() assert tss.shape == (len(f.train_ids),) if f.files[0].format_version == '0.5': assert np.all(np.isnat(tss)) else: assert not np.any(np.isnat(tss)) tss_l = f.train_timestamps(pydatetime=True) assert len(tss_l) == len(f.train_ids) if f.files[0].format_version == '0.5': assert all(t is None for t in tss_l) else: assert not any(t is None for t in tss_l) def test_union(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) xgm = "SPB_XTD9_XGM/DOOCS/MAIN" camera = "FXE_XAD_GEC/CAM/CAMERA" # Test union of different sources sel1 = run.select(xgm, 'beamPosition.ixPos') sel2 = run.select(xgm, 'beamPosition.iyPos') joined = sel1.union(sel2) assert joined.control_sources == { xgm } assert joined.selection == { xgm: { 'beamPosition.ixPos.value', 'beamPosition.iyPos.value', } } # Test union of different train selections sel1 = run.select_trains(by_id[10200:10220]) sel2 = run.select_trains(by_index[:10]) joined = sel1.union(sel2) assert joined.train_ids == list(range(10000, 10010)) + list(range(10200, 10220)) # Test union of different sources in different train selections sel1 = run.select(xgm).select_trains(by_index[:5]) sel2 = run.select(camera).select_trains(by_index[-5:]) joined = sel1.union(sel2) expected_tids = run.train_ids[:5] + run.train_ids[-5:] assert joined.train_ids == expected_tids assert joined[xgm].train_ids == expected_tids assert joined[camera].train_ids == expected_tids # Try via operators. sel1 = run.select(xgm, 'beamPosition.ixPos') sel2 = run.select(xgm, 'beamPosition.iyPos') joined = sel1 | sel2 assert joined.selection == { xgm: { 'beamPosition.ixPos.value', 'beamPosition.iyPos.value', } } sel1 |= sel2 assert sel1.selection == { xgm: { 'beamPosition.ixPos.value', 'beamPosition.iyPos.value', } } def test_union_raw_proc(mock_spb_raw_run, mock_spb_proc_run): raw_run = RunDirectory(mock_spb_raw_run) proc_run = RunDirectory(mock_spb_proc_run) run = raw_run.deselect('*AGIPD1M*').union(proc_run) assert run.all_sources == (raw_run.all_sources | proc_run.all_sources) if raw_run.run_metadata()['dataFormatVersion'] != '0.5': assert run.is_single_run def test_union_multiple_runs(mock_spb_raw_run, mock_jungfrau_run, mock_scs_run): run_spb = RunDirectory(mock_spb_raw_run) run_jf = RunDirectory(mock_jungfrau_run) run_scs = RunDirectory(mock_scs_run) assert run_spb.is_single_run assert run_jf.is_single_run assert run_scs.is_single_run # Union in one go u1 = run_spb.union(run_jf, run_scs) assert u1.all_sources == (run_spb.all_sources | run_jf.all_sources | run_scs.all_sources) assert not u1.is_single_run # Union in two steps u2 = run_scs.union(run_jf).union(run_spb) assert u2.all_sources == u1.all_sources assert not u1.is_single_run def test_read_skip_invalid(mock_lpd_data, empty_h5_file, capsys): d = DataCollection.from_paths([mock_lpd_data, empty_h5_file]) assert d.instrument_sources == {'FXE_DET_LPD1M-1/DET/0CH0:xtdf'} out, err = capsys.readouterr() assert "Skipping file" in err def test_run_immutable_sources(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) before = len(test_run.all_sources) with pytest.raises(AttributeError): test_run.all_sources.pop() assert len(test_run.all_sources) == before def test_open_file(mock_sa3_control_data): f = H5File(mock_sa3_control_data) file_access = f.files[0] assert file_access.format_version in ('0.5', '1.0', '1.2') assert 'SA3_XTD10_VAC/TSENS/S30180K' in f.control_sources if file_access.format_version == '0.5': assert 'METADATA/dataSourceId' in file_access.file else: assert 'METADATA/dataSources/dataSourceId' in file_access.file @pytest.mark.skipif(hasattr(os, 'geteuid') and os.geteuid() == 0, reason="cannot run permission tests as root") def test_permission(): d = mkdtemp() os.chmod(d, not stat.S_IRUSR) with pytest.raises(PermissionError) as excinfo: run = RunDirectory(d) assert "Permission denied" in str(excinfo.value) assert d in str(excinfo.value) def test_empty_file_info(mock_empty_file, capsys): f = H5File(mock_empty_file) f.info() # smoke test def test_get_data_counts(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) count = run.get_data_counts('SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value') assert count.index.tolist() == run.train_ids assert (count.values == 1).all() def test_get_run_value(mock_fxe_control_data): f = H5File(mock_fxe_control_data) src = 'FXE_XAD_GEC/CAM/CAMERA' val = f.get_run_value(src, 'firmwareVersion') assert isinstance(val, np.int32) assert f.get_run_value(src, 'firmwareVersion.value') == val with pytest.raises(SourceNameError): f.get_run_value(src + '_NONEXIST', 'firmwareVersion') with pytest.raises(PropertyNameError): f.get_run_value(src, 'non.existant') def test_get_run_value_union_multirun(mock_fxe_control_data, mock_fxe_control_data1): f = H5File(mock_fxe_control_data) f2 = H5File(mock_fxe_control_data1) data = f.union(f2) with pytest.raises(MultiRunError): data.run_metadata() with pytest.raises(MultiRunError): data.get_run_value('FXE_XAD_GEC/CAM/CAMERA', 'firmwareVersion') with pytest.raises(MultiRunError): data.get_run_values('FXE_XAD_GEC/CAM/CAMERA') def test_get_run_value_union(mock_fxe_control_data, mock_sa3_control_data): f = H5File(mock_fxe_control_data) f2 = H5File(mock_sa3_control_data) data = f.union(f2) if data.files[0].format_version != '0.5': assert data.get_run_value( 'FXE_XAD_GEC/CAM/CAMERA', 'firmwareVersion') == 0 assert ( data.run_metadata()["runNumber"] == f.run_metadata()["runNumber"] == f2.run_metadata()["runNumber"] ) def test_get_run_values(mock_fxe_control_data): f = H5File(mock_fxe_control_data) src = 'FXE_XAD_GEC/CAM/CAMERA' d = f.get_run_values(src, ) assert isinstance(d['firmwareVersion.value'], np.int32) assert isinstance(d['enableShutter.value'], np.uint8) def test_get_run_values_no_trains(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) sel = run.select_trains(np.s_[:0]) d = sel.get_run_values('SPB_IRDA_JF4M/MDL/POWER') assert isinstance(d['voltage.value'], np.float64) def test_inspect_key_no_trains(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) sel = run.select_trains(np.s_[:0]) # CONTROL jf_pwr_voltage = sel['SPB_IRDA_JF4M/MDL/POWER', 'voltage'] assert jf_pwr_voltage.shape == (0,) assert jf_pwr_voltage.dtype == np.dtype(np.float64) # INSTRUMENT jf_m1_data = sel['SPB_IRDA_JF4M/DET/JNGFR01:daqOutput', 'data.adc'] assert jf_m1_data.shape == (0, 16, 512, 1024) assert jf_m1_data.dtype == np.dtype(np.float32) def test_run_metadata(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) md = run.run_metadata() if run.files[0].format_version == '0.5': assert md == {'dataFormatVersion': '0.5'} else: assert md['dataFormatVersion'] in ('1.0', '1.2') assert set(md) == { 'dataFormatVersion', 'creationDate', 'updateDate', 'daqLibrary', 'karaboFramework', 'proposalNumber', 'runNumber', 'runType', 'sample', 'sequenceNumber', } assert isinstance(md['creationDate'], str) def test_run_metadata_no_trains(mock_scs_run): run = RunDirectory(mock_scs_run) sel = run.select_trains(np.s_[:0]) md = sel.run_metadata() assert md['dataFormatVersion'] == '1.0' def test_proc_legacy_sources(mock_modern_spb_proc_run): run = RunDirectory(mock_modern_spb_proc_run) src_pattern = 'SPB_DET_AGIPD1M-1/{}/{}CH0:{}' corr_sources = {src_pattern.format('CORR', i, 'output') for i in range(16)} det_sources = {src_pattern.format('DET', i, 'xtdf') for i in range(16)} # Should contain both canonical and legacy names. assert run.all_sources == corr_sources | det_sources assert run.instrument_sources == corr_sources | det_sources assert not run.control_sources # Should only contain canonical names. assert run.detector_sources == corr_sources # Should map legacy to canonical names. assert run.legacy_sources == dict(zip( sorted(det_sources), sorted(corr_sources))) det_mod0 = src_pattern.format('DET', 0, 'xtdf') # Classic APIs continue to work as normal, but raise warnings # whenever data is accessed through creation of SourceData object. with pytest.warns(DeprecationWarning): assert 'image.data' in run.keys_for_source(det_mod0) with pytest.warns(DeprecationWarning): assert run.get_dtype(det_mod0, 'image.data') == np.float32 assert run.select(det_mod0).all_sources == {det_mod0} def test_datacollection_contains(mock_fxe_control_data): run = H5File(mock_fxe_control_data) assert 'FXE_XAD_GEC/CAM/CAMERA:daqOutput' in run assert 'MY/LITTLE/PONY' not in run assert ('MY/LITTLE/PONY', 'actualPosition') not in run assert ('SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos') in run assert ('SPB_XTD9/XGM/DOOCS/MAIN', '42') not in run assert 42 not in run ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_run_files_map.py0000644000175100001660000000516514757376472023060 0ustar00runnerdockerimport h5py import numpy as np import os import pytest import unittest.mock as mock from .mockdata import write_file from .mockdata.xgm import XGM from extra_data import run_files_map, RunDirectory def test_candidate_paths(tmp_path): # 'real' paths (like /gpfs/exfel/d) prop_raw_path = tmp_path / 'raw' / 'FXE' / '201901' / 'p001234' run_dir = prop_raw_path / 'r0450' run_dir.mkdir(parents=True) # stable paths (like /gpfs/exfel/exp) exp = tmp_path / 'exp' prop_dir = exp / 'FXE' / '201901' / 'p001234' prop_scratch = exp / 'FXE' / '201901' / 'p001234' / 'scratch' prop_scratch.mkdir(parents=True) (prop_dir / 'raw').symlink_to(prop_raw_path) run_in_exp = prop_dir / 'raw' / 'r0450' with mock.patch.object(run_files_map, 'SCRATCH_ROOT_DIR', str(exp)): rfm = run_files_map.RunFilesMap(str(run_dir)) rfm_exp = run_files_map.RunFilesMap(str(run_in_exp)) assert rfm.candidate_paths == [ str(run_dir / 'karabo_data_map.json'), str(prop_scratch / '.karabo_data_maps' / 'raw_r0450.json'), ] assert rfm_exp.candidate_paths == [ str(run_in_exp / 'karabo_data_map.json'), str(prop_scratch / '.karabo_data_maps' / 'raw_r0450.json'), ] @pytest.fixture() def run_with_extra_file(mock_fxe_raw_run): extra_file = os.path.join(mock_fxe_raw_run, 'RAW-R0450-DA02-S00000.h5') write_file(extra_file, [ XGM('FXE_TEST_XGM/DOOCS/MAIN'), ], ntrains=480) try: yield mock_fxe_raw_run, extra_file finally: os.unlink(extra_file) def test_save_load_map(run_with_extra_file, tmp_path): run_dir, extra_file = run_with_extra_file run_map_path = str(tmp_path / 'kd_test_run_map.json') class TestRunFilesMap(run_files_map.RunFilesMap): def map_paths_for_run(self, directory): return [run_map_path] rfm = TestRunFilesMap(run_dir) assert rfm.files_data == {} with RunDirectory(run_dir) as run: rfm.save(run.files) rfm2 = TestRunFilesMap(run_dir) assert rfm2.cache_file == run_map_path file_info = rfm2.get(extra_file) assert isinstance(file_info['train_ids'], np.ndarray) assert isinstance(file_info['control_sources'], frozenset) assert isinstance(file_info['instrument_sources'], frozenset) assert isinstance(file_info['flag'], np.ndarray) np.testing.assert_array_equal(file_info['flag'], True) # Modify a file; this should make the cache invalid with h5py.File(extra_file, 'r+') as f: f.attrs['test_save_load_map'] = 1 rfm3 = TestRunFilesMap(run_dir) assert rfm3.cache_file == run_map_path assert rfm3.get(extra_file) is None ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_slice_objs.py0000644000175100001660000000077714757376472022355 0ustar00runnerdockerfrom extra_data import by_id, by_index def test_slicing_reprs(): ns = {'by_id': by_id, 'by_index': by_index} samples = [ 'by_id[:]', 'by_id[:2]', 'by_id[0:10:2]', 'by_id[4::2, 7]', 'by_index[:5, 3:12]', 'by_index[-4:, ...]', 'by_index[...]', 'by_index[..., ::-1]', ] # These examples are canonically formatted, so their repr() should match for expr in samples: obj = eval(expr, ns) assert repr(obj) == expr ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_sourcedata.py0000644000175100001660000002664114757376472022371 0ustar00runnerdockerimport numpy as np import pytest from extra_data import RunDirectory, by_id, by_index from extra_data.exceptions import PropertyNameError, SourceNameError, NoDataError def test_get_sourcedata(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) am0 = run['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf'] assert len(am0.files) == 1 assert am0.section == 'INSTRUMENT' assert am0.is_instrument assert not am0.is_control assert am0.index_groups == {'header', 'detector', 'image', 'trailer'} xgm = run['SPB_XTD9_XGM/DOOCS/MAIN'] assert len(xgm.files) == 2 assert xgm.section == 'CONTROL' assert xgm.is_control assert not xgm.is_instrument assert xgm.index_groups == {''} def test_keys(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm = run['SPB_XTD9_XGM/DOOCS/MAIN'] # Control keys can omit .value suffix, but .keys() will not list that. assert 'beamPosition.ixPos.value' in xgm assert 'beamPosition.ixPos' in xgm assert 'beamPosition.ixPos.value' in xgm.keys() assert 'beamPosition.ixPos.timestamp' in xgm.keys() assert 'beamPosition.ixPos' not in xgm.keys() assert xgm['beamPosition.ixPos.value'].dtype == np.dtype('f4') assert xgm['beamPosition.ixPos'].dtype == np.dtype('f4') # .keys(inc_timestamp=False) will give us only the name before '.value' assert 'beamPosition.ixPos.value' not in xgm.keys(inc_timestamps=False) assert 'beamPosition.ixPos.timestamp' not in xgm.keys(inc_timestamps=False) assert 'beamPosition.ixPos' in xgm.keys(inc_timestamps=False) # Recreate the run and xgm objects so we can test one_key() when the # FileAccess caches are empty. run = RunDirectory(mock_spb_raw_run) xgm = run["SPB_XTD9_XGM/DOOCS/MAIN"] # Make sure that one_key() does indeed return a valid key for # control/instrument sources. assert xgm.one_key() in xgm.keys() xgm_output = run['SPB_XTD9_XGM/DOOCS/MAIN:output'] assert xgm_output.one_key() in xgm_output.keys() # Test one_key() with index group. am0 = run['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf'] assert am0.one_key('image').startswith('image.') with pytest.raises(ValueError): # Asking for a de-selected index group. assert am0.select_keys('header.*').one_key('image') with pytest.raises(ValueError): # Not an index group of this source. assert am0.one_key('data') def test_select_keys(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm = run['SPB_XTD9_XGM/DOOCS/MAIN'] # Select exact key xpos_key = 'beamPosition.ixPos.value' assert xgm.select_keys('beamPosition.ixPos.value').keys() == {xpos_key} assert xgm.select_keys('beamPosition.ixPos').keys() == {xpos_key} assert xgm.select_keys({'beamPosition.ixPos.value'}).keys() == {xpos_key} assert xgm.select_keys({'beamPosition.ixPos'}).keys() == {xpos_key} # Select all keys all_keys = xgm.keys() assert xgm.select_keys(set()).keys() == all_keys assert xgm.select_keys(None).keys() == all_keys assert xgm.select_keys('*').keys() == all_keys # Select keys with glob pattern beampos_keys = { 'beamPosition.ixPos.value', 'beamPosition.ixPos.timestamp', 'beamPosition.iyPos.value', 'beamPosition.iyPos.timestamp' } assert xgm.select_keys('beamPosition.*').keys() == beampos_keys assert xgm.select_keys('beamPosition.*').select_keys('*').keys() == beampos_keys # select keys on INSTRUMENT data am0 = run['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf'] key = 'image.data' assert am0.select_keys(key).keys() == {key} assert am0.select_keys('*').keys() == am0.keys() with pytest.raises(PropertyNameError): am0.select_keys('data.image') def test_select_trains(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm = run['SPB_XTD9_XGM/DOOCS/MAIN'] assert len(xgm.train_ids) == 64 sel = xgm.select_trains(by_id[10020:10040]) assert sel.train_ids == list(range(10020, 10040)) sel = xgm.select_trains(by_index[:10]) assert sel.train_ids == list(range(10000, 10010)) sel = xgm.select_trains(by_index[999995:999999]) assert sel.train_ids == [] assert sel.keys() == xgm.keys() sel = xgm[by_id[10020:10040]] assert sel.train_ids == list(range(10020, 10040)) sel = xgm[by_index[:10]] assert sel.train_ids == list(range(10000, 10010)) sel = xgm[10] assert sel.train_ids == [10010] sel = xgm[999:1000] assert sel.train_ids == [] assert sel.keys() == xgm.keys() def test_split_trains(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm = run['SPB_XTD9_XGM/DOOCS/MAIN'] assert len(xgm.train_ids) == 64 chunks = list(xgm.split_trains(3)) assert len(chunks) == 3 assert {len(c.train_ids) for c in chunks} == {21, 22} # The middle chunk spans across 2 files assert [len(c.files) for c in chunks] == [1, 2, 1] chunks = list(xgm.split_trains(3, trains_per_part=20)) assert len(chunks) == 4 assert {len(c.train_ids) for c in chunks} == {16} def test_union(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm = run['SPB_XTD9_XGM/DOOCS/MAIN'] am0 = run['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf'] sel = xgm.select_trains(np.s_[:10]).union(xgm.select_trains(np.s_[-10:])) assert sel.train_ids == list(range(10000, 10010)) + list(range(10054, 10064)) with pytest.raises(ValueError): xgm.union(am0) sel = xgm.select_trains(np.s_[:10]) | xgm.select_trains(np.s_[-10:]) assert sel.train_ids == list(range(10000, 10010)) + list(range(10054, 10064)) sel = xgm.select_trains(np.s_[:10]) sel |= xgm.select_trains(np.s_[-10:]) assert sel.train_ids == list(range(10000, 10010)) + list(range(10054, 10064)) def test_run_value(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm = run['SPB_XTD9_XGM/DOOCS/MAIN'] am0 = run['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf'] value = xgm.run_value('pulseEnergy.conversion.value') assert isinstance(value, np.float64) run_dict = xgm.run_values() assert 'pulseEnergy.conversion.value' in run_dict assert 'pulseEnergy.conversion.timestamp' in run_dict values_dict = xgm.run_values(inc_timestamps=False) assert 'pulseEnergy.conversion' in values_dict assert 'pulseEnergy.conversion.timestamp' not in values_dict with pytest.raises(ValueError): # no run values for instrument sources am0.run_values() def test_device_class(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm_ctrl = run['SPB_XTD9_XGM/DOOCS/MAIN'] assert xgm_ctrl.device_class == 'DoocsXGM' xgm_inst = run['SPB_XTD9_XGM/DOOCS/MAIN:output'] assert xgm_inst.device_class is None def test_euxfel_path_infos(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm = run['SPB_XTD9_XGM/DOOCS/MAIN'] assert xgm.storage_class is None # Not an EuXFEL path. assert xgm.data_category == 'RAW' assert xgm.aggregator == 'DA01' # Changed to preserve the behaviour of above, as using a voview # file with 0-len datasets anyway causes a return of None. It # therefore attempts to use the regular .files property instead, # either suceeding or failing as badly as it would with a voview. run = RunDirectory(mock_spb_raw_run).select_trains(np.s_[:0]) xgm = run['SPB_XTD9_XGM/DOOCS/MAIN'] assert xgm.storage_class is None assert xgm.aggregator == 'DA01' @pytest.mark.parametrize('source', [ 'SPB_XTD9_XGM/DOOCS/MAIN', # Control data. 'SPB_IRU_CAM/CAM/SIDEMIC:daqOutput', # Pipeline data. 'SPB_DET_AGIPD1M-1/DET/0CH0:xtdf' # XTDF data. ]) def test_data_counts_modes(mock_reduced_spb_proc_run, source): run = RunDirectory(mock_reduced_spb_proc_run) sd = run[source] import pandas as pd for index_group in [None, *sd.index_groups]: count1 = sd.data_counts(index_group=index_group) assert isinstance(count1, pd.Series) assert count1.index.tolist() == sd.train_ids count2 = sd.data_counts(labelled=False, index_group=index_group) assert isinstance(count2, np.ndarray) assert len(count2) == len(sd.train_ids) np.testing.assert_equal(count1, count2) def test_data_counts_values(mock_reduced_spb_proc_run): run = RunDirectory(mock_reduced_spb_proc_run) # control data xgm = run['SPB_XTD9_XGM/DOOCS/MAIN'] assert (xgm.data_counts().values == 1).all() with pytest.raises(ValueError): xgm.data_counts(index_group='data') # instrument data camera = run['SPB_IRU_CAM/CAM/SIDEMIC:daqOutput'] assert (camera.data_counts().values == 1).all() with pytest.raises(ValueError): camera.data_counts(index_group='not-data') am0 = run['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf'] num_images = am0['image.data'].shape[0] assert am0.data_counts().values.sum() >= num_images assert am0.data_counts(index_group='image').values.sum() == num_images with pytest.raises(ValueError): am0.data_counts(index_group='preamble') def test_drop_empty_trains(mock_reduced_spb_proc_run): run = RunDirectory(mock_reduced_spb_proc_run) am0 = run['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf'] # Compare all index groups with `require_any`. np.testing.assert_equal( am0.drop_empty_trains().train_ids, run.select(am0.source, '*', require_any=True).train_ids) # Compare one specific index group with `require_all`. np.testing.assert_equal( am0.drop_empty_trains(index_group='image').train_ids, run.select(am0.source, 'image.*', require_all=True).train_ids) with pytest.raises(ValueError): am0.drop_empty_trains(index_group='preamble') def test_train_id_coordinates(mock_reduced_spb_proc_run): run = RunDirectory(mock_reduced_spb_proc_run) # control data. xgm = run['SPB_XTD9_XGM/DOOCS/MAIN'] np.testing.assert_equal( xgm.train_id_coordinates(), xgm.train_id_coordinates('')) np.testing.assert_equal( xgm.train_id_coordinates(), xgm['pulseEnergy.conversion'].train_id_coordinates()) with pytest.raises(ValueError): xgm.train_id_coordinates('data') # instrument data. camera = run['SPB_IRU_CAM/CAM/SIDEMIC:daqOutput'] np.testing.assert_equal( camera.train_id_coordinates(), camera.train_id_coordinates('data')) np.testing.assert_equal( camera.train_id_coordinates(), camera['data.image.pixels'].train_id_coordinates()) with pytest.raises(ValueError): camera.train_id_coordinates('image') # xtdf data. am0 = run['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf'] np.testing.assert_equal( am0.train_id_coordinates('header'), am0['header.pulseCount'].train_id_coordinates()) np.testing.assert_equal( am0.train_id_coordinates('image'), am0['image.data'].train_id_coordinates()) # Should fail due to multiple index groups with differing counts. with pytest.raises(ValueError): am0.train_id_coordinates() def test_legacy_sourcedata(mock_modern_spb_proc_run): run = RunDirectory(mock_modern_spb_proc_run) det_mod0 = 'SPB_DET_AGIPD1M-1/DET/0CH0:xtdf' corr_mod0 = 'SPB_DET_AGIPD1M-1/CORR/0CH0:output' # True (canonical) source works as normal sd = run[corr_mod0] assert sd.canonical_name == corr_mod0 assert not sd.is_legacy # Obtaining SourceData object via legacy name emits a warning. with pytest.warns(DeprecationWarning): sd = run[det_mod0] assert sd.source == det_mod0 assert sd.canonical_name == corr_mod0 assert sd.is_legacy ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_stacking.py0000644000175100001660000001516514757376472022041 0ustar00runnerdockerimport numpy as np import pytest # numpy.exceptions exists from 1.25 onwards, but for Python 3.8 we still support # numpy 1.24. We can clean this up once we require Python >= 3.9. try: from numpy.exceptions import AxisError except ImportError: from numpy import AxisError from extra_data import RunDirectory, stack_data, stack_detector_data from extra_data.stacking import StackView def test_stack_data(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, devices=[('FXE_DET_LPD1M-1/DET/*', 'image.data')]) comb = stack_data(data, 'image.data') assert comb.shape == (128, 1, 16, 256, 256) def test_stack_detector_data(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, devices=[('*1M*/DET/*', 'image.data')]) comb = stack_detector_data(data, 'image.data') assert comb.shape == (128, 1, 16, 256, 256) def test_stack_detector_data_missing(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, devices=[('*/DET/*', 'image.data')]) # Three variants of missing data: # 1. Source missing del data['FXE_DET_LPD1M-1/DET/3CH0:xtdf'] # 2. Key missing del data['FXE_DET_LPD1M-1/DET/7CH0:xtdf']['image.data'] # 3. Empty array missing = ['FXE_DET_LPD1M-1/DET/{}CH0:xtdf'.format(m) for m in (1, 5, 9, 15)] for module in missing: data[module]['image.data'] = np.zeros((0, 1, 256, 256), dtype=np.uint16) comb = stack_detector_data(data, 'image.data', fillvalue=22) assert comb.shape == (128, 1, 16, 256, 256) assert not (comb[:, :, 0] == 22).any() # Control assert (comb[:, :, 3] == 22).all() # Source missing assert (comb[:, :, 7] == 22).all() # Key missing assert (comb[:, :, 5] == 22).all() # Empty array # default fillvalue for int is 0 comb = stack_detector_data(data, 'image.data') assert (comb[:, :, 3] == 0).all() with pytest.raises(ValueError): comb = stack_detector_data(data, 'image.data', fillvalue=np.nan) def test_stack_detector_data_stackview(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, devices=[('*/DET/*', 'image.data')]) # Three variants of missing data: # 1. Source missing del data['FXE_DET_LPD1M-1/DET/3CH0:xtdf'] # 2. Key missing del data['FXE_DET_LPD1M-1/DET/7CH0:xtdf']['image.data'] # 3. Empty array missing = ['FXE_DET_LPD1M-1/DET/{}CH0:xtdf'.format(m) for m in (1, 5, 9, 15)] for module in missing: data[module]['image.data'] = np.zeros((0, 1, 256, 256), dtype=np.uint16) comb = stack_detector_data(data, 'image.data', fillvalue=22, real_array=False) assert comb.shape == (128, 1, 16, 256, 256) assert not (comb[:, :, 0] == 22).any() # Control assert (comb[:, :, 3] == 22).all() # Source missing assert (comb[:, :, 7] == 22).all() # Key missing assert (comb[:, :, 5] == 22).all() # Empty array # Slice across all modules pulse = comb[0, 0] assert pulse.shape == (16, 256, 256) assert not (pulse[0] == 22).any() assert (pulse[3] == 22).all() assert (pulse[7] == 22).all() assert (pulse[5] == 22).all() pulse_arr = pulse.asarray() assert pulse_arr.shape == (16, 256, 256) assert pulse_arr.max() == 22 assert pulse_arr.min() == 0 def test_stack_detector_data_wrong_pulses(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, devices=[('*/DET/*', 'image.data')]) misshaped = ['FXE_DET_LPD1M-1/DET/{}CH0:xtdf'.format(m) for m in (12, 13)] for module in misshaped: data[module]['image.data'] = np.zeros((64, 1, 256, 256), dtype=np.uint16) with pytest.raises(ValueError) as excinfo: comb = stack_detector_data(data, 'image.data') assert '(64, 1, 256, 256)' in str(excinfo.value) def test_stack_detector_data_wrong_shape(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, devices=[('*/DET/*', 'image.data')]) misshaped = ['FXE_DET_LPD1M-1/DET/{}CH0:xtdf'.format(m) for m in (0, 15)] for module in misshaped: data[module]['image.data'] = np.zeros((128, 1, 512, 128), dtype=np.uint16) with pytest.raises(ValueError) as excinfo: comb = stack_detector_data(data, 'image.data') assert '(128, 1, 512, 128)' in str(excinfo.value) def test_stack_detector_data_type_error(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, devices=[('*/DET/*', 'image.data')]) module = 'FXE_DET_LPD1M-1/DET/3CH0:xtdf' data[module]['image.data'] = data[module]['image.data'].astype(np.float32) with pytest.raises(ValueError) as excinfo: comb = stack_detector_data(data, 'image.data') assert "dtype('float32')" in str(excinfo.value) def test_stack_detector_data_extra_mods(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, devices=[('*/DET/*', 'image.data')]) data.setdefault( 'FXE_DET_LPD1M-1/DET/16CH0:xtdf', {'image.data': np.zeros((128, 1, 256, 256), dtype=np.uint16)}, ) with pytest.raises(IndexError) as excinfo: comb = stack_detector_data(data, 'image.data') assert "16" in str(excinfo.value) def test_stack_detector_data_jungfrau(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) _, data = run.select('*JF4M/DET/*', 'data.adc').train_from_index(0) comb = stack_detector_data( data, 'data.adc', modules=8, pattern=r'/DET/JNGFR(\d+)', starts_at=1 ) assert comb.shape == (16, 8, 512, 1024) def test_stack_detector_data_jungfrau_keep_dims(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) _, data = run.select('*JF4M/DET/*', 'data.adc').train_from_index(0, keep_dims=True) comb = stack_detector_data( data, 'data.adc', modules=8, pattern=r'/DET/JNGFR(\d+)', starts_at=1 ) assert comb.shape == (1, 16, 8, 512, 1024) def test_stackview_squeeze(): # Squeeze not dropping stacking dim data = {0: np.zeros((1, 4)), 1: np.zeros((1, 4))} sv = StackView(data, 2, (1, 4), data[0], 0, stack_axis=0) assert sv.shape == (2, 1, 4) assert sv.squeeze().shape == (2, 4) # Squeeze dropping stacking dim data = {0: np.zeros((1, 4))} sv = StackView(data, 1, (1, 4), data[0].dtype, 0, stack_axis=0) assert sv.shape == (1, 1, 4) assert sv.squeeze().shape == (4,) assert sv.squeeze(axis=0).shape == (1, 4) assert sv.squeeze(axis=-2).shape == (1, 4) with pytest.raises(AxisError): sv.squeeze(axis=4) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_streamer.py0000644000175100001660000001047514757376472022057 0ustar00runnerdocker"""Test streaming data with ZMQ interface.""" import os import signal from subprocess import PIPE, Popen, TimeoutExpired import numpy as np import pytest from extra_data import by_id, H5File, RunDirectory from extra_data.export import _iter_trains, ZMQStreamer from karabo_bridge import Client def test_merge_detector(mock_fxe_raw_run, mock_fxe_control_data, mock_spb_proc_run): with RunDirectory(mock_fxe_raw_run) as run: for tid, data in _iter_trains(run, merge_detector=True): assert 'FXE_DET_LPD1M-1/DET/APPEND' in data assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' not in data shape = data['FXE_DET_LPD1M-1/DET/APPEND']['image.data'].shape assert shape == (128, 1, 16, 256, 256) break for tid, data in _iter_trains(run): assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' in data shape = data['FXE_DET_LPD1M-1/DET/0CH0:xtdf']['image.data'].shape assert shape == (128, 1, 256, 256) break with H5File(mock_fxe_control_data) as run: for tid, data in _iter_trains(run, merge_detector=True): assert frozenset(data) == run.select_trains(by_id[[tid]]).all_sources break with RunDirectory(mock_spb_proc_run) as run: for tid, data in _iter_trains(run, merge_detector=True): shape = data['SPB_DET_AGIPD1M-1/DET/APPEND']['image.data'].shape assert shape == (64, 16, 512, 128) shape = data['SPB_DET_AGIPD1M-1/DET/APPEND']['image.gain'].shape assert shape == (64, 16, 512, 128) shape = data['SPB_DET_AGIPD1M-1/DET/APPEND']['image.mask'].shape assert shape == (64, 16, 512, 128) break def cleanup_proc(p: Popen): if p.poll() is None: p.send_signal(signal.SIGINT) try: p.wait(timeout=2) except TimeoutExpired: pass if p.poll() is None: p.kill() rc = p.wait(timeout=2) assert rc == -9 # process terminated by kill signal @pytest.mark.skipif(os.name != 'posix', reason="Test uses Unix socket") def test_serve_files(mock_fxe_raw_run, tmp_path): src = 'FXE_XAD_GEC/CAM/CAMERA:daqOutput' args = ['karabo-bridge-serve-files', '-z', 'PUSH', str(mock_fxe_raw_run), f'ipc://{tmp_path}/socket', '--source', src] interface = None p = Popen(args, stdin=PIPE, stdout=PIPE, stderr=PIPE, env=dict(os.environ, PYTHONUNBUFFERED='1')) try: for line in p.stdout: line = line.decode('utf-8') if line.startswith('Streamer started on:'): interface = line.partition(':')[2].strip() break print('interface:', interface) assert interface is not None, p.stderr.read().decode() with Client(interface, sock='PULL', timeout=30) as c: data, meta = c.next() tid = next(m['timestamp.tid'] for m in meta.values()) assert tid == 10000 assert set(data) == {src} finally: cleanup_proc(p) @pytest.mark.skipif(os.name != 'posix', reason="Test uses Unix socket") def test_serve_run(mock_spb_raw_and_proc_run, tmp_path): mock_data_root, _, _ = mock_spb_raw_and_proc_run zmq_endpoint = f'ipc://{tmp_path}/socket' xgm_src = 'SPB_XTD9_XGM/DOOCS/MAIN' agipd_m0_src = 'SPB_DET_AGIPD1M-1/DET/0CH0:xtdf' args = ['karabo-bridge-serve-run', '2012', '238', '--port', zmq_endpoint, '--include', f'{xgm_src}[beamPosition.i*Pos]', '--include', '*AGIPD1M-1/DET/0CH0:xtdf' ] p = Popen(args, env=dict( os.environ, PYTHONUNBUFFERED='1', EXTRA_DATA_DATA_ROOT=mock_data_root )) try: with Client(zmq_endpoint, timeout=30) as c: data, meta = c.next() tid = next(m['timestamp.tid'] for m in meta.values()) assert tid == 10000 assert set(data) == {xgm_src, agipd_m0_src} assert set(data[xgm_src]) == \ {f'beamPosition.i{xy}Pos.value' for xy in 'xy'} | {'metadata'} assert data[agipd_m0_src]['image.data'].dtype == np.float32 finally: cleanup_proc(p) def test_deprecated_server(): with pytest.deprecated_call(): with ZMQStreamer(2222): pass if __name__ == '__main__': pytest.main(["-v"]) print("Run 'py.test -v -s' to see more output") ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_validation.py0000644000175100001660000001556114757376472022370 0ustar00runnerdockerimport os.path as osp from pathlib import Path from h5py import File import numpy as np from pytest import fixture, raises from tempfile import TemporaryDirectory from extra_data.validation import FileAccess, FileValidator, RunValidator, ValidationError, main from . import make_examples @fixture(scope='function') def agipd_file(): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R0239-AGIPD00-S00000.h5') make_examples.make_agipd_file(path) yield path @fixture(scope='function') def data_aggregator_file(): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R0450-DA01-S00001.h5') make_examples.make_fxe_da_file(path) yield path def test_validate_run(mock_fxe_raw_run): rv = RunValidator(mock_fxe_raw_run) rv.validate() def test_file_error(mock_fxe_raw_run): not_readable = Path(mock_fxe_raw_run) / 'notReadable.h5' not_readable.touch(mode=0o066) problems = RunValidator(mock_fxe_raw_run).run_checks() assert len(problems) == 1 assert problems[0]['msg'] == 'Could not access file' assert problems[0]['file'] == str(not_readable) def test_zeros_in_train_ids(agipd_file): with File(agipd_file, 'r+') as f: # introduce zeros in trainId f['/INDEX/trainId'][12] = 0 with raises(ValidationError) as excinfo: FileValidator(FileAccess(agipd_file)).validate() problem = excinfo.value.problems[0] assert problem['msg'] == 'Zeroes in trainId index before last train ID' assert problem['dataset'] == 'INDEX/trainId' assert 'RAW-R0239-AGIPD00-S00000.h5' in problem['file'] def test_non_strictly_increasing_train_ids(agipd_file): with File(agipd_file, 'r+') as f: # introduce non strictly increasing trainId f['/INDEX/trainId'][10] = 11010 f['/INDEX/trainId'][20] = 5 with raises(ValidationError) as excinfo: FileValidator(FileAccess(agipd_file)).validate() problem = excinfo.value.problems.pop() assert problem['msg'] == 'Train IDs are not strictly increasing, e.g. at 10 (11010 >= 10011)' assert problem['dataset'] == 'INDEX/trainId' assert 'RAW-R0239-AGIPD00-S00000.h5' in problem['file'] def test_index_pointing_outside_dataset(data_aggregator_file): with File(data_aggregator_file, 'r+') as f: # index pointing outside dataset f['/INDEX/FXE_XAD_GEC/CAM/CAMERA:daqOutput/data/first'][30] = 999 with raises(ValidationError) as excinfo: FileValidator(FileAccess(data_aggregator_file)).validate() assert 'Index referring to data (1000) outside dataset (400)' in str(excinfo.value) def test_invalid_first_dataset(data_aggregator_file): with File(data_aggregator_file, 'a') as f: # invalid first shape length = len(f['INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data/first']) f['INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data/first'].resize((length+1,)) with raises(ValidationError) as excinfo: FileValidator(FileAccess(data_aggregator_file)).validate() problem = excinfo.value.problems.pop() assert problem['msg'] == 'Index first & count have different number of entries' assert problem['dataset'] == 'INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data' assert problem['first_shape'] == (401,) assert problem['count_shape'] == (400,) assert 'RAW-R0450-DA01-S00001.h5' in problem['file'] def test_invalid_first_and_count_dataset(data_aggregator_file): with File(data_aggregator_file, 'a') as f: # invalid first/index shape length = len(f['INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data/first']) f['INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data/first'].resize((length-1,)) length = len(f['INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data/count']) f['INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data/count'].resize((length-1,)) with raises(ValidationError) as excinfo: FileValidator(FileAccess(data_aggregator_file)).validate() problem = excinfo.value.problems.pop() assert problem['msg'] == 'Index has wrong number of entries' assert problem['dataset'] == 'INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data' assert problem['index_shape'] == (399,) assert problem['trainids_shape'] == (400,) assert 'RAW-R0450-DA01-S00001.h5' in problem['file'] def test_first_dataset_not_starting_from_zero(data_aggregator_file): with File(data_aggregator_file, 'a') as f: # first index not starting at zero f['INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data/first'][0] = 1 with raises(ValidationError) as excinfo: FileValidator(FileAccess(data_aggregator_file)).validate() assert "Index doesn't start at 0" in str(excinfo.value) assert "INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data" in str(excinfo.value) def test_overlap(agipd_file): with File(agipd_file, 'r+') as f: # overlap first index f['INDEX/SPB_DET_AGIPD1M-1/DET/0CH0:xtdf/image/first'][1] = 0 f['INDEX/SPB_DET_AGIPD1M-1/DET/0CH0:xtdf/image/count'][1] = 128 # no gaps with raises(ValidationError) as excinfo: FileValidator(FileAccess(agipd_file)).validate() problem = excinfo.value.problems.pop() assert problem['msg'] == 'Overlaps (1) in index, e.g. at 0 (0 + 64 > 0)' assert problem['dataset'] == 'INDEX/SPB_DET_AGIPD1M-1/DET/0CH0:xtdf/image' assert 'RAW-R0239-AGIPD00-S00000.h5' in problem['file'] def test_gaps(agipd_file): with File(agipd_file, 'r+') as f: # gap in index f['INDEX/SPB_DET_AGIPD1M-1/DET/0CH0:xtdf/image/first'][1] = 0 f['INDEX/SPB_DET_AGIPD1M-1/DET/0CH0:xtdf/image/count'][0] = 0 with raises(ValidationError) as excinfo: FileValidator(FileAccess(agipd_file)).validate() problem = excinfo.value.problems.pop() assert problem['msg'] == 'Gaps (1) in index, e.g. at 1 (0 + 64 < 128)' assert problem['dataset'] == 'INDEX/SPB_DET_AGIPD1M-1/DET/0CH0:xtdf/image' assert 'RAW-R0239-AGIPD00-S00000.h5' in problem['file'] def test_file_without_data(mock_empty_file): FileValidator(FileAccess(mock_empty_file)).validate() def test_control_data_timestamps(data_aggregator_file): with File(data_aggregator_file, 'r+') as f: # control data timestamp is not in data ts = f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/pulseEnergy/photonFlux/timestamp'] ts[:] = np.arange(len(ts)) + 1 ts[10] = 5 with raises(ValidationError) as excinfo: FileValidator(FileAccess(data_aggregator_file)).validate() problem = excinfo.value.problems.pop() assert problem['msg'] == 'Timestamp is decreasing, e.g. at 10 (5 < 10)' assert problem['dataset'] == 'CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/pulseEnergy/photonFlux/timestamp' assert 'RAW-R0450-DA01-S00001.h5' in problem['file'] def test_main_file_non_h5(tmp_path, capsys): not_h5 = tmp_path / 'notHDF5.h5' not_h5.write_text("Accessible file, not HDF5") status = main([str(not_h5)]) assert status == 1 assert 'Could not open HDF5 file' in capsys.readouterr().out ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_voview.py0000644000175100001660000001245614757376472021555 0ustar00runnerdockerfrom pathlib import Path from shutil import copytree from testpath import assert_isfile from extra_data import H5File, RunDirectory, voview def test_main(mock_spb_raw_run, tmp_path): voview_file = tmp_path / 'run_overview.h5' voview.main([mock_spb_raw_run, '--overview-file', str(voview_file)]) assert_isfile(voview_file) res = voview.main([mock_spb_raw_run, '--overview-file', str(voview_file), '--check']) assert res in (0, None) def test_use_voview(mock_spb_raw_run, tmp_path): new_run_dir = tmp_path / 'r0238' copytree(mock_spb_raw_run, new_run_dir) voview_file = new_run_dir / 'overview.h5' run_orig = RunDirectory(str(new_run_dir), _use_voview=False) assert len(run_orig.files) > 1 assert voview.find_file_write(new_run_dir) == str(voview_file) vofw = voview.VirtualOverviewFileWriter(voview_file, run_orig) vofw.write() run = RunDirectory(str(new_run_dir)) assert [f.filename for f in run.files] == [str(voview_file)] assert len(run.train_ids) == 64 run_metadata = run.run_metadata() run_orig_metadata = run_orig.run_metadata() # Check the format version specifically, this is the only metadata that may # differ from the rest. if run_orig_metadata['dataFormatVersion'] != "0.5": # For all versions above 0.5 (i.e. 1.0 onwards), we write the voview # files in the 1.0 format. assert run_metadata['dataFormatVersion'] == "1.0" del run_metadata['dataFormatVersion'] del run_orig_metadata['dataFormatVersion'] else: assert run_metadata['dataFormatVersion'] == \ run_orig_metadata['dataFormatVersion'] # Check the rest of the metadata assert run_metadata == run_orig_metadata assert 'SPB_DET_AGIPD1M-1/DET/0CH0:xtdf' in run.instrument_sources assert 'SA1_XTD2_XGM/DOOCS/MAIN' in run.control_sources with RunDirectory(str(new_run_dir)) as run: assert 'SPB_DET_AGIPD1M-1/DET/0CH0:xtdf' in run.instrument_sources assert 'SA1_XTD2_XGM/DOOCS/MAIN' in run.control_sources xgm_intens = run['SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD'] assert {p.name for p in xgm_intens.source_file_paths} == { 'RAW-R0238-DA01-S00000.h5', 'RAW-R0238-DA01-S00001.h5' } assert {p.name for p in xgm_intens[:30].source_file_paths} == { 'RAW-R0238-DA01-S00000.h5' } assert {p.name for p in xgm_intens[:0].source_file_paths} == { 'RAW-R0238-DA01-S00000.h5' } assert xgm_intens.units == 'μJ' assert xgm_intens.units_name == 'microjoule' xgm_src = run['SA1_XTD2_XGM/DOOCS/MAIN:output'] src_grp = xgm_src.files[0].file[f'INSTRUMENT/{xgm_src.source}'] assert src_grp.attrs['source_files'][:].tolist() == [ str(new_run_dir / f'RAW-R0238-DA01-S{i:05}.h5') for i in range(2) ] def test_make_voview_missing_data(mock_fxe_raw_run, tmp_path): run_orig = RunDirectory(mock_fxe_raw_run) voview_file = tmp_path / 'overview.h5' vofw = voview.VirtualOverviewFileWriter(voview_file, run_orig) vofw.write() vf = H5File(voview_file) cam_src = vf['FXE_XAD_GEC/CAM/CAMERA_NODATA:daqOutput'] assert cam_src.aggregator == 'DA01' cam_data = cam_src['data.image.pixels'] assert cam_data.shape[0] == 0 assert cam_data.source_file_paths == [ Path(mock_fxe_raw_run, 'RAW-R0450-DA01-S00000.h5') ] def open_run_with_voview(run_src, new_run_dir): copytree(run_src, new_run_dir) voview_file = new_run_dir / 'overview.h5' run_orig = RunDirectory(str(new_run_dir), _use_voview=False) vofw = voview.VirtualOverviewFileWriter(voview_file, run_orig) vofw.write() opened = RunDirectory(str(new_run_dir)) assert len(opened.files) == 1 return opened def test_combine_voview(mock_spb_raw_run, mock_spb_proc_run, tmp_path): raw_dc = open_run_with_voview(mock_spb_raw_run, tmp_path / 'r0238_raw') proc_dc = open_run_with_voview(mock_spb_proc_run, tmp_path / 'r0238_proc') # Deselect & union data like we do for open_run(..., data='all') raw_extra = raw_dc.deselect([ (src, '*') for src in raw_dc.all_sources & proc_dc.all_sources] ) assert raw_extra.instrument_sources == { 'SA1_XTD2_XGM/DOOCS/MAIN:output', 'SPB_XTD9_XGM/DOOCS/MAIN:output', 'SPB_IRU_CAM/CAM/SIDEMIC:daqOutput', } run = proc_dc.union(raw_extra) assert 'SPB_DET_AGIPD1M-1/DET/0CH0:xtdf' in run.instrument_sources assert 'SA1_XTD2_XGM/DOOCS/MAIN' in run.control_sources def test_voview_paths(tmp_path, monkeypatch): monkeypatch.setattr(voview, 'DATA_ROOT_DIR', str(tmp_path)) maxwell_run_dir = tmp_path / 'raw' / 'XMPL' / '202102' / 'p700000' / 'r0123' maxwell_run_dir.mkdir(parents=True) voview_file_in_run_m = maxwell_run_dir / 'overview.h5' usr_dir = tmp_path / 'XMPL' / '202102' / 'p700000' / 'usr' usr_dir.mkdir(parents=True) voview_file_in_usr = usr_dir / '.extra_data' / 'RAW-R0123-OVERVIEW.h5' assert voview.voview_paths_for_run(maxwell_run_dir) == [ str(voview_file_in_run_m), str(voview_file_in_usr) ] online_run_dir = tmp_path / 'XMPL' / '202102' / 'p700000' / 'raw' / 'r0123' online_run_dir.mkdir(parents=True) voview_file_in_run_o = online_run_dir / 'overview.h5' assert voview.voview_paths_for_run(online_run_dir) == [ str(voview_file_in_run_o), str(voview_file_in_usr) ] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/tests/test_writer.py0000644000175100001660000000453014757376472021544 0ustar00runnerdockerimport h5py import os.path as osp import numpy as np from tempfile import TemporaryDirectory from testpath import assert_isfile from extra_data import RunDirectory, H5File def test_write_selected(mock_fxe_raw_run): with TemporaryDirectory() as td: new_file = osp.join(td, 'test.h5') with RunDirectory(mock_fxe_raw_run) as run: run.select('SPB_XTD9_XGM/*').write(new_file) assert_isfile(new_file) with H5File(new_file) as f: assert f.control_sources == {'SPB_XTD9_XGM/DOOCS/MAIN'} assert f.instrument_sources == {'SPB_XTD9_XGM/DOOCS/MAIN:output'} s = f.get_series('SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value') # This should have concatenated the two sequence files (400 + 80) assert len(s) == 480 a = f.get_array('SPB_XTD9_XGM/DOOCS/MAIN:output', 'data.intensityTD') assert a.shape == (480, 1000) def test_write_virtual(mock_fxe_raw_run): with TemporaryDirectory() as td: new_file = osp.join(td, 'test.h5') with RunDirectory(mock_fxe_raw_run) as run: run.write_virtual(new_file) assert_isfile(new_file) with h5py.File(new_file, 'r') as f: ds = f['CONTROL/SPB_XTD9_XGM/DOOCS/MAIN/beamPosition/ixPos/value'] assert ds.is_virtual link = f.get('RUN/SPB_XTD9_XGM/DOOCS/MAIN', getlink=True) assert isinstance(link, h5py.ExternalLink) with H5File(new_file) as f: np.testing.assert_array_equal(f.train_ids, np.arange(10000, 10480, dtype=np.uint64)) assert 'SPB_XTD9_XGM/DOOCS/MAIN' in f.control_sources assert 'SPB_XTD9_XGM/DOOCS/MAIN:output' in f.instrument_sources s = f.get_series('SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value') # This should have concatenated the two sequence files (400 + 80) assert len(s) == 480 a = f.get_array('SPB_XTD9_XGM/DOOCS/MAIN:output', 'data.intensityTD') assert a.shape == (480, 1000) r = f.get_run_value('SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value') assert isinstance(r, np.float32) cam_nodata = f['FXE_XAD_GEC/CAM/CAMERA_NODATA:daqOutput', 'data.image.pixels'] assert cam_nodata.shape == (0, 255, 1024) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/utils.py0000644000175100001660000000174314757376472017172 0ustar00runnerdocker""" Helpers functions for the euxfel_h5tools package. Copyright (c) 2017, European X-Ray Free-Electron Laser Facility GmbH All rights reserved. You should have received a copy of the 3-Clause BSD License along with this program. If not, see """ import os from shutil import get_terminal_size def available_cpu_cores(): # This process may be restricted to a subset of the cores on the machine; # sched_getaffinity() tells us which on some Unix flavours (inc Linux) if hasattr(os, "sched_getaffinity"): return len(os.sched_getaffinity(0)) else: # Fallback, inc on Windows ncpu = os.cpu_count() or 2 return min(ncpu, 8) def progress_bar(done, total, suffix=" "): line = f"Progress: {done}/{total}{suffix}[{{}}]" length = min(get_terminal_size().columns - len(line), 50) filled = int(length * done // total) bar = "#" * filled + " " * (length - filled) return line.format(bar) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/validation.py0000644000175100001660000003257314757376472020171 0ustar00runnerdockerfrom argparse import ArgumentParser, Action from multiprocessing import Pool from functools import partial import numpy as np import os import os.path as osp from signal import signal, SIGINT, SIG_IGN import sys from .reader import H5File, FileAccess from .run_files_map import RunFilesMap from .utils import progress_bar class ValidationError(Exception): def __init__(self, problems): self.problems = problems def __str__(self): lines = [] for prob in self.problems: lines.extend(['', prob['msg']]) for k, v in sorted(prob.items()): if k != 'msg': lines.append(" {}: {}".format(k, v)) return '\n'.join(lines) def problem(msg, **kwargs): return dict(msg=msg, **kwargs) class FileValidator: def __init__(self, file: FileAccess, skip_checks=()): self.file = file self.filename = file.filename self.problems = [] self.skip_checks = set(skip_checks) check_funcs = [] def validate(self): problems = self.run_checks() if problems: raise ValidationError(problems) def add_filename(self, prob: dict): prob['file'] = self.filename return prob def run_checks(self): self.problems = [] for func in self.check_funcs: if func.__name__ in self.skip_checks: continue self.problems += [self.add_filename(p) for p in func(self.file)] return self.problems def file_check(f): FileValidator.check_funcs.append(f) return f @file_check def train_ids_nonzero(file): ds_path = 'INDEX/trainId' train_ids = file.file[ds_path][:] if (train_ids == 0).any(): first0 = train_ids.tolist().index(0) if not (train_ids[first0:] == 0).all(): yield problem( 'Zeroes in trainId index before last train ID', dataset=ds_path ) @file_check def train_ids_order(file): ds_path = 'INDEX/trainId' train_ids = file.file[ds_path][:] nonzero_tids = train_ids[train_ids != 0] if len(nonzero_tids) > 1: non_incr = (nonzero_tids[1:] <= nonzero_tids[:-1]).nonzero()[0] if non_incr.size > 0: pos = non_incr[0] yield problem( 'Train IDs are not strictly increasing, e.g. at {} ({} >= {})'.format( pos, nonzero_tids[pos], nonzero_tids[pos + 1] ), dataset=ds_path, ) @file_check def index_control(file): for src in file.control_sources: first, count = file.get_index(src, '') for key in file.get_keys(src): ds_path = f"CONTROL/{src}/{key.replace('.', '/')}" data_dim0 = file.file[ds_path].shape[0] if np.any((first + count) > data_dim0): max_end = (first + count).max() yield problem( 'Index referring to data ({}) outside dataset ({})'.format( max_end, data_dim0 ), dataset=ds_path, ) break # Recording every key separately can make a *lot* of errors yield from _check_index(file, f'INDEX/{src}') @file_check def index_instrument(file): for src in file.instrument_sources: src_groups = set() for key in file.get_keys(src): ds_path = 'INSTRUMENT/{}/{}'.format(src, key.replace('.', '/')) group = key.split('.', 1)[0] src_groups.add((src, group)) first, count = file.get_index(src, group) data_dim0 = file.file[ds_path].shape[0] if np.any((first + count) > data_dim0): max_end = (first + count).max() yield problem( 'Index referring to data ({}) outside dataset ({})'.format( max_end, data_dim0 ), dataset=ds_path, ) for src, group in src_groups: yield from _check_index(file, f'INDEX/{src}/{group}') def _get_index(file, path): """returns first and count dataset for specified source. This is slightly different to the same method in FileAccess as it does cut the dataset up to the trainId's dataset length. """ ix_group = file.file[path] firsts = ix_group['first'][:] if 'count' in ix_group: counts = ix_group['count'][:] else: status = ix_group['status'][:] counts = np.uint64((ix_group['last'][:] - firsts + 1) * status) return firsts, counts def _check_index(file, path): ds_problem = partial(problem, dataset=path) first, count = _get_index(file, path) if (first.ndim != 1) or (count.ndim != 1): yield ds_problem( "Index first / count are not 1D", first_shape=first.shape, count_shape=count.shape, ) return if first.shape != count.shape: yield ds_problem( "Index first & count have different number of entries", first_shape=first.shape, count_shape=count.shape, ) return if first.shape != file.train_ids.shape: yield ds_problem( "Index has wrong number of entries", index_shape=first.shape, trainids_shape=file.train_ids.shape, ) yield from check_index_contiguous(first, count, ds_problem) def check_index_contiguous(firsts, counts, ds_problem): if firsts.size == 0: return # no data in this dataset if firsts[0] != 0: yield ds_problem("Index doesn't start at 0") gaps = firsts[1:].astype(np.int64) - (firsts + counts)[:-1] gap_ixs = (gaps > 0).nonzero()[0] if gap_ixs.size > 0: pos = gap_ixs[0] yield ds_problem("Gaps ({}) in index, e.g. at {} ({} + {} < {})".format( gap_ixs.size, pos, firsts[pos], counts[pos], firsts[pos+1] )) overlap_ixs = (gaps < 0).nonzero()[0] if overlap_ixs.size > 0: pos = overlap_ixs[0] yield ds_problem("Overlaps ({}) in index, e.g. at {} ({} + {} > {})".format( overlap_ixs.size, pos, firsts[pos], counts[pos], firsts[pos + 1] )) @file_check def control_timestamps_order(file): """Check that CONTROL value's timestamps are monotonically increasing. """ for source in file.control_sources: for key in file.get_keys(source): if not key.endswith('.timestamp'): continue ds_path = f'CONTROL/{source}/{key.replace(".", "/")}' ts = file.file[ds_path][:] if (ts == 0).any(): first0 = np.where(ts == 0)[0][0] if not (ts[first0:] == 0).all(): yield problem( 'Zeroes in Timestamp before last train ID', dataset=ds_path ) nonzero_ts = ts[:first0] else: nonzero_ts = ts non_incr = (nonzero_ts[1:] < nonzero_ts[:-1]).nonzero()[0] if non_incr.size > 0: pos = non_incr[0] yield problem( f'Timestamp is decreasing, e.g. at ' f'{pos + 1} ({ts[pos + 1]} < {ts[pos]})', dataset=ds_path, ) def _open_file(filepath): try: fa = FileAccess(filepath) except Exception as e: try: with open(filepath, "rb") as f: f.read(16) except OSError as e2: # Filesystem issue, e.g. dCache node down. HDF5 errors can be # confusing, so record the OS error instead. pb = dict(msg="Could not access file", file=filepath, error=e2) else: # HDF5 file corrupted or missing expected information pb = dict(msg="Could not open HDF5 file", file=filepath, error=e) return None, [pb] else: return fa, [] class RunValidator: def __init__(self, run_dir: str, term_progress=False, skip_checks=()): self.run_dir = run_dir self.term_progress = term_progress self.filenames = [f for f in os.listdir(run_dir) if f.endswith('.h5')] self.file_accesses = [] self.problems = [] self.skip_checks = set(skip_checks) check_funcs = [] def validate(self): problems = self.run_checks() if problems: raise ValidationError(problems) def run_checks(self): self.problems = [] # check_files populates file_accesses as well as running FileValidator self.check_files() for func in self.check_funcs: if func.__name__ in self.skip_checks: continue self.problems += func(self.run_dir, self.file_accesses) return self.problems def progress(self, done, total, nproblems, badfiles): """Show progress information""" if not self.term_progress: return lines = progress_bar(done, total) lines += f'\n{nproblems} problems' if badfiles: lines += f' in {len(badfiles)} files (last: {badfiles[-1]})' if sys.stderr.isatty(): # "\x1b[2K": delete whole line, "\x1b[1A": move up cursor print('\x1b[2K\x1b[1A\x1b[2K', end='\r',file=sys.stderr) print(lines, end='', file=sys.stderr) else: print(lines, file=sys.stderr) def _check_file(self, args): runpath, filename = args filepath = osp.join(runpath, filename) fa, problems = _open_file(filepath) if fa is not None: fv = FileValidator(fa, skip_checks=self.skip_checks) problems.extend(fv.run_checks()) fa.close() return filename, fa, problems def check_files(self): self.file_accesses = [] def initializer(): # prevent child processes from receiving KeyboardInterrupt signal(SIGINT, SIG_IGN) filepaths = [(self.run_dir, fn) for fn in sorted(self.filenames)] nfiles = len(self.filenames) badfiles = [] self.progress(0, nfiles, 0, badfiles) with Pool(initializer=initializer) as pool: iterator = pool.imap_unordered(self._check_file, filepaths) for done, (fname, fa, problems) in enumerate(iterator, start=1): if problems: self.problems.extend(problems) badfiles.append(fname) if fa is not None: self.file_accesses.append(fa) self.progress(done, nfiles, len(self.problems), badfiles) if not self.file_accesses: self.problems.append( dict(msg="No usable files found", directory=self.run_dir) ) def run_dir_check(f): RunValidator.check_funcs.append(f) return f @run_dir_check def run_json_cache(run_dir, file_accesses): # Outdated cache entries we can detect with the file's stat() are not a # problem. Loading the cache file will discard those automatically. cache = RunFilesMap(run_dir) for f_access in file_accesses: f_cache = cache.get(f_access.filename) if f_cache is None: continue if ( f_cache['control_sources'] != f_access.control_sources or f_cache['instrument_sources'] != f_access.instrument_sources or not np.array_equal(f_cache['train_ids'], f_access.train_ids) ): yield dict( msg="Incorrect data map cache entry", cache_file=cache.cache_file, data_file=f_access.filename, ) f_access.close() class ListAction(Action): def __call__(self, parser, namespace, values, option_string=None): print("Available checks:") for func in FileValidator.check_funcs + RunValidator.check_funcs: print(f" {func.__name__}") parser.exit() def main(argv=None): if argv is None: argv = sys.argv[1:] ap = ArgumentParser(prog='extra-data-validate') ap.add_argument('path', help="HDF5 file or run directory of HDF5 files.") ap.add_argument('-l', '--list', action=ListAction, nargs=0, help="List available checks (options for --skip)") ap.add_argument('--skip', action='append', help="Skip a named check (may be used several times)") args = ap.parse_args(argv) available_checks = { f.__name__ for f in FileValidator.check_funcs + RunValidator.check_funcs } bad_skips = set(args.skip or []) - available_checks if bad_skips: print("Unknown names passed to --skip:", ", ".join(sorted(bad_skips))) return 1 path = args.path if os.path.isdir(path): print("Checking run directory:", path) print() validator = RunValidator(path, term_progress=True, skip_checks=args.skip) else: print("Checking file:", path) fa, problems = _open_file(path) if problems: print(str(ValidationError(problems))) return 1 validator = FileValidator(fa, skip_checks=args.skip) try: validator.run_checks() except KeyboardInterrupt: print('\n^C (validation cancelled)') else: print() # Start a new line if validator.problems: print(f"Validation failed! {len(validator.problems)} problems:") print(str(ValidationError(validator.problems))) return 1 else: print("No problems found") if __name__ == '__main__': sys.exit(main()) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/voview.py0000644000175100001660000001270514757376472017351 0ustar00runnerdocker"""Create & check 'virtual overview' files These use virtual datasets to present the data from a run as a single file. """ import os import os.path as osp import re import sys from tempfile import TemporaryDirectory import h5py from .file_access import FileAccess from .writer import VirtualFileWriter DATA_ROOT_DIR = "/gpfs/exfel/exp/" # Version number for virtual overview format - increment if we need to stop old # versions of EXtra-data from reading files made by newer versions. VOVIEW_VERSION = 1 class VirtualOverviewFileWriter(VirtualFileWriter): def record_source_files(self): grp = self.file.create_group('.source_files') names, sizes = [], [] for fa in self.data.files: st = fa.metadata_fstat or os.stat(fa.filename) names.append(osp.basename(fa.filename).encode('ascii')) sizes.append(st.st_size) grp.create_dataset( 'names', data=names, dtype=h5py.special_dtype(vlen=bytes) ) grp.create_dataset('sizes', data=sizes, dtype='u8') def write(self): self.record_source_files() self.file.attrs['virtual_overview_version'] = VOVIEW_VERSION super().write() def check_sources(overview_file: h5py.File, run_dir): g = overview_file['.source_files'] if not (g['names'].shape == g['sizes'].shape): return False # Basic check that things make sense files_now = {f for f in os.listdir(run_dir) if f.endswith('.h5') and (f.lower() != 'overview.h5')} files_stored = [p.decode('ascii') for p in g['names'][:]] if files_now != set(files_stored): return False for name, size in zip(files_stored, g['sizes']): st = os.stat(osp.join(run_dir, name)) if st.st_size != size: return False return True def voview_paths_for_run(directory): paths = [osp.join(directory, 'overview.h5')] # After resolving symlinks, data on Maxwell is stored in either # GPFS, e.g. /gpfs/exfel/d/proc/SCS/201901/p002212 or # dCache, e.g. /pnfs/xfel.eu/exfel/archive/XFEL/raw/SCS/201901/p002212 # On the online cluster the resolved path stay: # /gpfs/exfel/exp/inst/cycle/prop/(raw|proc)/run maxwell_match = re.match( # raw/proc instr cycle prop run r'.+/(raw|proc)/(\w+)/(\w+)/(p\d+)/(r\d+)/?$', osp.realpath(directory) ) online_match = re.match( # instr cycle prop raw/proc run r'^.+/(\w+)/(\w+)/(p\d+)/(raw|proc)/(r\d+)/?$', osp.realpath(directory) ) if maxwell_match: raw_proc, instr, cycle, prop, run_nr = maxwell_match.groups() elif online_match: instr, cycle, prop, raw_proc, run_nr = online_match.groups() else: return paths fname = f'{raw_proc.upper()}-{run_nr.upper()}-OVERVIEW.h5' prop_usr = osp.join( DATA_ROOT_DIR, instr, cycle, prop, 'usr' ) if osp.isdir(prop_usr): paths.append( osp.join(prop_usr, '.extra_data', fname) ) return paths def find_file_read(run_dir): for candidate in voview_paths_for_run(run_dir): if osp.isfile(candidate): return candidate def find_file_valid(run_dir): for candidate in voview_paths_for_run(run_dir): if h5py.is_hdf5(candidate): file_acc = FileAccess(candidate) version = file_acc.file.attrs.get('virtual_overview_version', 0) if version <= VOVIEW_VERSION and check_sources(file_acc.file, run_dir): return file_acc def find_file_write(run_dir): for candidate in voview_paths_for_run(run_dir): try: os.makedirs(osp.dirname(candidate), exist_ok=True) candidate_tmp = candidate + '.check' with open(candidate_tmp, 'wb'): pass os.unlink(candidate_tmp) return candidate except PermissionError: pass raise PermissionError def write_atomic(path, data): """Write a virtual overview file, then rename it to the final path This aims to avoid exposing a partially written file where EXtra-data might try to read it. """ dirname, basename = osp.split(path) with TemporaryDirectory(prefix=".create-voview-", dir=dirname) as td: tmp_filename = osp.join(td, basename) try: vofw = VirtualOverviewFileWriter(tmp_filename, data) vofw.write() os.replace(tmp_filename, path) except: os.unlink(tmp_filename) raise def main(argv=None): import argparse ap = argparse.ArgumentParser() ap.add_argument('--check', action='store_true') ap.add_argument('run_dir') ap.add_argument('--overview-file') args = ap.parse_args(argv) if args.check: file_path = args.overview_file or find_file_read(args.run_dir) print(f"Checking {file_path} ...") with h5py.File(file_path, 'r') as f: ok = check_sources(f, args.run_dir) if ok: print("Source files match, overview file can be used") else: print("Source files don't match, overview file outdated") return 1 else: from . import RunDirectory file_path = args.overview_file or find_file_write(args.run_dir) print("Opening", args.run_dir) run = RunDirectory(args.run_dir, _use_voview=False) print(f"Creating {file_path} from {len(run.files)} files...") write_atomic(file_path, run) if __name__ == '__main__': sys.exit(main()) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/write_cxi.py0000644000175100001660000004030614757376472020025 0ustar00runnerdocker"""Writing CXI files from AGIPD/LPD data""" import h5py import logging import numpy as np log = logging.getLogger(__name__) class VirtualCXIWriterBase: """ Base class for machinery to write a CXI file containing virtual datasets. You don't normally need to use this class directly. Instead, use the write_virtual_cxi() method on a multi-module detector data interface object. CXI specifies a particular layout of data in the HDF5 file format. It is documented here: http://www.cxidb.org/cxi.html This code writes version 1.5 CXI files. Parameters ---------- detdata: extra_data.components.MultimodDetectorBase The detector data interface for the data to gather in this file. """ # 1 entry is an index along the first (time) dimension in the source files. # XTDF detectors (AGIPD etc.) arrange pulses along this dimension, so each # entry is one frame & one memory cell. JUNGFRAU in burst mode makes one # entry with a separate dimension for several pulses, so overrides this. cells_per_entry = 1 def __init__(self, detdata): self.detdata = detdata self.group_label, self.image_label = detdata._main_data_key.split('.') frame_counts = detdata.frame_counts * self.cells_per_entry self.nframes = frame_counts.sum() log.info("Up to %d frames per train, %d frames in total", frame_counts.max(), self.nframes) self.train_ids_perframe = np.repeat( frame_counts.index.values, frame_counts.values.astype(np.intp) ) # For AGIPD, DSSC & LPD detectors modules are numbered from 0. # Overridden for JUNGFRAU to number from 1. self.modulenos = list(range(self.nmodules)) @property def nmodules(self): """Number of detector modules.""" return self.detdata.n_modules @property def data(self): """DataCollection with detector data from a run.""" return self.detdata.data def _get_module_index(self, module): """Returns an index for the specified module.""" return self.modulenos.index(module) def collect_pulse_ids(self): """ Gather pulse/cell ID labels for all modules and check consistency. Raises ------ Exception: Some data has no pulse ID values for any module. Exception: Inconsistent pulse IDs between detector modules. Returns ------- pulse_ids_min: np.array Array of pulse IDs per frame common for all detector modules. """ # Gather pulse IDs NO_PULSE_ID = 9999 pulse_ids = np.full((self.nframes, self.nmodules), NO_PULSE_ID, dtype=np.uint64) pulse_key = self.group_label + '.' + self.pulse_id_label for source, modno in self.detdata.source_to_modno.items(): module_ix = self._get_module_index(modno) for chunk in self.data._find_data_chunks(source, pulse_key): chunk_data = chunk.dataset self._map_chunk(chunk, chunk_data, pulse_ids, module_ix) # Sanity checks on pulse IDs pulse_ids_min = pulse_ids.min(axis=1) if (pulse_ids_min == NO_PULSE_ID).any(): raise Exception("Failed to find pulse IDs for some data") pulse_ids[pulse_ids == NO_PULSE_ID] = 0 if (pulse_ids_min != pulse_ids.max(axis=1)).any(): raise Exception("Inconsistent pulse IDs for different modules") # Pulse IDs make sense. Drop the modules dimension, giving one # pulse ID for each frame. return pulse_ids_min def _map_chunk(self, chunk, chunk_data, target, tgt_ax1, have_data=None): """ Map data from chunk into target. Chunk points to contiguous source data, but if this misses a train, it might not correspond to a contiguous region in the output. So this may perform multiple mappings. Parameters ---------- chunk: read_machinery::DataChunk Reference to a contiguous chunk of data to be mapped. chunk_data: h5py.Dataset / h5py.VirtualSource Dataset / VirtualSource to map data from. target: np.array / h5py.VirtualLayout Target to map data to. tgt_ax1: int Value for the target axis 1 - index corresponding to the detector module. have_data: np.array(dtype=bool), optional An array to monitor which part of the target have been mapped with data. Defaults to None. """ # Expand the list of train IDs to one per frame for tgt_slice, chunk_slice in self.detdata._split_align_chunk( chunk, self.detdata.train_ids_perframe ): tgt_start = tgt_slice.start * self.cells_per_entry tgt_end = tgt_slice.stop * self.cells_per_entry if self.cells_per_entry == 1: # In some cases, there's an extra dimension of length 1. # E.g. JUNGFRAU data with 1 memory cell per train or # DSSC/LPD raw data. if (len(chunk_data.shape) > 1 and chunk_data.shape[1] == 1): matched = chunk_data[chunk_slice, 0] else: matched = chunk_data[chunk_slice] target[tgt_start:tgt_end, tgt_ax1] = matched else: matched = chunk_data[chunk_slice] if isinstance(chunk_data, h5py.VirtualSource): # Use broadcasting of h5py.VirtualSource target[tgt_start:tgt_end, tgt_ax1] = matched else: target[tgt_start:tgt_end, tgt_ax1] = matched.reshape( (-1,) + matched.shape[2:]) # Fill in the map of what data we have if have_data is not None: have_data[tgt_start:tgt_end, tgt_ax1] = True def _map_layouts(self, layouts): """ Map virtual sources into virtual layouts. Parameters ---------- layouts: dict A dictionary of unmapped virtual layouts. Returns ------- layouts: dict A dictionary of virtual layouts mapped to the virtual sources. """ for name, layout in layouts.items(): key = '{}.{}'.format(self.group_label, name) have_data = np.zeros((self.nframes, self.nmodules), dtype=bool) for source, modno in self.detdata.source_to_modno.items(): print(f" ### Source: {source}, ModNo: {modno}, Key: {key}") module_ix = self._get_module_index(modno) for chunk in self.data._find_data_chunks(source, key): vsrc = h5py.VirtualSource(chunk.dataset) self._map_chunk(chunk, vsrc, layout, module_ix, have_data) filled_pct = 100 * have_data.sum() / have_data.size if hasattr(layout, 'sources'): n_mappings = len(layout.sources) # h5py < 3.3 else: n_mappings = layout.dcpl.get_virtual_count() # h5py >= 3.3 log.info(f"Assembled {n_mappings:d} chunks for {key:s}, " f"filling {filled_pct:.2f}% of the hyperslab") return layouts def write(self, filename, fillvalues=None): """ Write the file on disc to filename. Parameters ---------- filename: str Path of the file to be written. fillvalues: dict, optional Keys are datasets names (one of: data, gain, mask) and associated fill value for missing data. defaults are: - data: nan (proc, float32) or 0 (raw, uint16) - gain: 0 (uint8) - mask: 0xffffffff (uint32) """ pulse_ids = self.collect_pulse_ids() experiment_ids = np.char.add(np.char.add( self.train_ids_perframe.astype(str), ':'), pulse_ids.astype(str)) layouts = self.collect_data() data_label = self.image_label _fillvalues = { # Data can be uint16 (raw) or float32 (proc) data_label: np.nan if layouts[data_label].dtype.kind == 'f' else 0, 'gain': 0, 'mask': 0xffffffff } if fillvalues: _fillvalues.update(fillvalues) # Enforce that fill values are compatible with array dtype _fillvalues[data_label] = layouts[data_label].dtype.type( _fillvalues[data_label]) if 'gain' in layouts: _fillvalues['gain'] = layouts['gain'].dtype.type( _fillvalues['gain']) if 'mask' in layouts: _fillvalues['mask'] = layouts['mask'].dtype.type( _fillvalues['mask']) log.info("Writing to %s", filename) # Virtual datasets require HDF5 >= 1.10. # Specifying this up front should mean it fails before touching # the file if run on an older version. We also specify this as # the maximum version, to ensure we're creating files that can # be read by HDF5 1.10. with h5py.File(filename, 'w', libver=('v110', 'v110')) as f: f.create_dataset('cxi_version', data=[150]) d = f.create_dataset('entry_1/experiment_identifier', shape=experiment_ids.shape, dtype=h5py.special_dtype(vlen=str)) d[:] = experiment_ids # pulseId, trainId, cellId are not part of the CXI standard, # but it allows extra data. f.create_dataset(f'entry_1/{self.pulse_id_label}', data=pulse_ids) f.create_dataset('entry_1/trainId', data=self.train_ids_perframe) cellids = f.create_virtual_dataset('entry_1/cellId', layouts[self.cell_id_label]) cellids.attrs['axes'] = 'experiment_identifier:module_identifier' dgrp = f.create_group('entry_1/instrument_1/detector_1') if len(layouts[data_label].shape) == 4: axes_s = 'experiment_identifier:module_identifier:y:x' else: # 5D dataset, with extra axis for axes_s = 'experiment_identifier:module_identifier:data_gain:y:x' ndg = layouts[data_label].shape[2] d = f.create_dataset('entry_1/data_gain', shape=(ndg,), dtype=h5py.special_dtype(vlen=str)) d[:] = ([data_label, 'gain'] if ndg == 2 else [data_label]) dgrp['data_gain'] = h5py.SoftLink('/entry_1/data_gain') data = dgrp.create_virtual_dataset( 'data', layouts[data_label], fillvalue=_fillvalues[data_label] ) data.attrs['axes'] = axes_s if 'gain' in layouts: gain = dgrp.create_virtual_dataset( 'gain', layouts['gain'], fillvalue=_fillvalues['gain'] ) gain.attrs['axes'] = axes_s if 'mask' in layouts: mask = dgrp.create_virtual_dataset( 'mask', layouts['mask'], fillvalue=_fillvalues['mask'] ) mask.attrs['axes'] = axes_s dgrp['experiment_identifier'] = h5py.SoftLink( '/entry_1/experiment_identifier') f['entry_1/data_1'] = h5py.SoftLink( '/entry_1/instrument_1/detector_1') dgrp.create_dataset('module_identifier', data=self.modulenos) log.info("Finished writing virtual CXI file") class XtdfCXIWriter(VirtualCXIWriterBase): """ Machinery to write VDS files for a group of detectors with similar data format - AGIPD, DSSC & LPD. You don't normally need to use this class directly. Instead, use the write_virtual_cxi() method on a multi-module detector data interface object. CXI specifies a particular layout of data in the HDF5 file format. It is documented here: http://www.cxidb.org/cxi.html This code writes version 1.5 CXI files. Parameters ---------- detdata: extra_data.components.XtdfDetectorBase The detector data interface for the data to gather in this file. """ def __init__(self, detdata) -> None: self.cells_per_entry = 1 self.pulse_id_label = 'pulseId' self.cell_id_label = 'cellId' super().__init__(detdata) def collect_data(self): """ Prepare virtual layouts and map them to the virtual sources in the data chunks. Returns ------- layouts: dict A dictionary mapping virtual datasets names (e.g. ``data``) to h5py virtual layouts. """ src = next(iter(self.detdata.source_to_modno)) h5file = self.data[src].files[0].file image_grp = h5file['INSTRUMENT'][src][self.group_label] VLayout = h5py.VirtualLayout det_name = type(self.detdata).__name__ if 'gain' in image_grp: log.info(f"Identified {det_name} calibrated data") shape = (self.nframes, self.nmodules) + self.detdata.module_shape log.info("Virtual data shape: %r", shape) layouts = { self.image_label: VLayout( shape, dtype=image_grp[self.image_label].dtype), 'gain': VLayout(shape, dtype=image_grp['gain'].dtype), } if 'mask' in image_grp: layouts['mask'] = VLayout(shape, dtype=image_grp['mask'].dtype) else: log.info(f"Identified {det_name} raw data") shape = (self.nframes, self.nmodules) + image_grp['data'].shape[1:] log.info("Virtual data shape: %r", shape) layouts = { self.image_label: VLayout( shape, dtype=image_grp[self.image_label].dtype), } layouts[self.cell_id_label] = VLayout( (self.nframes, self.nmodules), dtype=image_grp[self.cell_id_label].dtype ) return self._map_layouts(layouts) class JUNGFRAUCXIWriter(VirtualCXIWriterBase): """ Machinery to write VDS files for JUNGFRAU data in the same format as AGIPD/LPD virtual datasets. You don't normally need to use this class directly. Instead, use the write_virtual_cxi() method on a multi-module detector data interface object. CXI specifies a particular layout of data in the HDF5 file format. It is documented here: http://www.cxidb.org/cxi.html This code writes version 1.5 CXI files. Parameters ---------- detdata: extra_data.components.JUNGFRAU The detector data interface for the data to gather in this file. """ def __init__(self, detdata) -> None: # Check number of cells src = next(iter(detdata.source_to_modno)) keydata = detdata.data[src, 'data.adc'] self.cells_per_entry = keydata.entry_shape[0] self.pulse_id_label = 'memoryCell' self.cell_id_label = 'memoryCell' super().__init__(detdata) # For JUNGFRAU detectors modules are numbered from 1 self.modulenos = list(range(1, self.nmodules + 1)) def collect_data(self): """ Prepare virtual layouts and map them to the virtual sources in the data chunks. Returns ------- layouts: dict A dictionary mapping virtual datasets names (e.g. ``data``) to h5py virtual layouts. """ src = next(iter(self.detdata.source_to_modno)) h5file = self.data[src].files[0].file image_grp = h5file['INSTRUMENT'][src][self.group_label] VLayout = h5py.VirtualLayout det_name = type(self.detdata).__name__ log.info(f"Identified {det_name} data") shape = (self.nframes, self.nmodules) + self.detdata.module_shape log.info("Virtual data shape: %r", shape) layouts = { self.image_label: VLayout( shape, dtype=image_grp[self.image_label].dtype), 'gain': VLayout(shape, dtype=image_grp['gain'].dtype), self.cell_id_label: VLayout( (self.nframes, self.nmodules), dtype=image_grp[self.cell_id_label].dtype ), } if 'mask' in image_grp: layouts['mask'] = VLayout(shape, dtype=image_grp['mask'].dtype) return self._map_layouts(layouts) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/extra_data/writer.py0000644000175100001660000002527014757376472017347 0ustar00runnerdockerimport h5py import numpy as np from packaging import version from .exceptions import MultiRunError class FileWriter: """Write data in European XFEL HDF5 format This is intended to allow copying a subset of data into a smaller, more portable file. """ def __init__(self, path, data): self.file = h5py.File(path, 'w') self.data = data self.indexes = {} # {path: (first, count)} self.data_sources = set() def prepare_source(self, source): """Prepare all the datasets for one source. We do this as a separate step so the contents of the file are defined together before the main data. """ for key in sorted(self.data.keys_for_source(source)): path = f"{self._section(source)}/{source}/{key.replace('.', '/')}" nentries = self._guess_number_of_storing_entries(source, key) src_ds1 = self.data[source].files[0].file[path] self.file.create_dataset_like( path, src_ds1, shape=(nentries,) + src_ds1.shape[1:], # Corrected detector data has maxshape==shape, but if any max # dim is smaller than the chunk size, h5py complains. Making # the first dimension unlimited avoids this. maxshape=(None,) + src_ds1.shape[1:], ) if source in self.data.instrument_sources: self.data_sources.add(f"INSTRUMENT/{source}/{key.partition('.')[0]}") if source not in self.data.instrument_sources: self.data_sources.add(f"CONTROL/{source}") def _guess_number_of_storing_entries(self, source, key): """Provide the length for the initial dataset to create. May be overridden in subclasses. """ return self.data.get_data_counts(source, key).sum() def _section(self, source): if source in self.data.instrument_sources: return 'INSTRUMENT' else: return 'CONTROL' def copy_dataset(self, source, key): """Copy data into a dataset""" a = self.data.get_array(source, key) path = f"{self._section(source)}/{source}/{key.replace('.', '/')}" self.file[path][:] = a.values self._make_index(source, key, a.coords['trainId'].values) def _make_index(self, source, key, data_tids): # Original files contain exactly 1 entry per train for control data, # but if one file starts before another, there can be some values # missing when we collect several files together. We don't try to # extrapolate to fill missing data, so some counts may be 0. if source in self.data.instrument_sources: index_path = source + '/' + key.partition('.')[0] else: index_path = source if index_path not in self.indexes: if source not in self.data.instrument_sources: assert len(np.unique(data_tids)) == len(data_tids),\ "Duplicate train IDs in control data!" self.indexes[index_path] = self._generate_index(data_tids) def _generate_index(self, data_tids): """Convert an array of train IDs to first/count for each train""" assert (np.diff(data_tids) >= 0).all(), "Out-of-order train IDs" counts = np.array([np.count_nonzero(t == data_tids) for t in self.data.train_ids], dtype=np.uint64) firsts = np.zeros_like(counts) firsts[1:] = np.cumsum(counts)[:-1] # firsts[0] is always 0 return firsts, counts def copy_source(self, source): """Copy data for all keys of one source""" for key in self.data.keys_for_source(source): self.copy_dataset(source, key) def write_train_ids(self): self.file.create_dataset( 'INDEX/trainId', data=self.data.train_ids, dtype='u8' ) train_timestamps = self.data.train_timestamps() if not np.all(np.isnat(train_timestamps)): self.file.create_dataset( 'INDEX/timestamp', data=train_timestamps.astype(np.uint64) ) def write_indexes(self): """Write the INDEX information for all data we've copied""" for groupname, (first, count) in self.indexes.items(): group = self.file.create_group(f'INDEX/{groupname}') group.create_dataset('first', data=first, dtype=np.uint64) group.create_dataset('count', data=count, dtype=np.uint64) def write_metadata(self): try: metadata = self.data.run_metadata() except MultiRunError: metadata = {} metadata_grp = self.file.create_group('METADATA') format_version = version.parse(metadata.get('dataFormatVersion')) if format_version >= version.parse("1.0"): # We don't care about the differences between version 1.0/1.1/1.2, # so for simplicity we stick to the 1.0 format. metadata["dataFormatVersion"] = "1.0" self.write_sources(metadata_grp.create_group('dataSources')) # File format 1.0 should also have INDEX/flag self.file.create_dataset('INDEX/flag', data=self.gather_flag()) for key, val in metadata.items(): metadata_grp[key] = [val] else: # File format '0.5': source lists directly in METADATA self.write_sources(metadata_grp) def write_sources(self, data_sources_grp: h5py.Group): """Write the METADATA section, including lists of sources""" vlen_bytes = h5py.special_dtype(vlen=bytes) data_sources = sorted(self.data_sources) N = len(data_sources) sources_ds = data_sources_grp.create_dataset( 'dataSourceId', (N,), dtype=vlen_bytes, maxshape=(None,) ) sources_ds[:] = data_sources root_ds = data_sources_grp.create_dataset( 'root', (N,), dtype=vlen_bytes, maxshape=(None,) ) root_ds[:] = [ds.split('/', 1)[0] for ds in data_sources] devices_ds = data_sources_grp.create_dataset( 'deviceId', (N,), dtype=vlen_bytes, maxshape=(None,) ) devices_ds[:] = [ds.split('/', 1)[1] for ds in data_sources] def gather_flag(self): """Make the array for INDEX/flag. Trains are valid (1) if they are valid in *any* of the source files. """ tid_arr = np.asarray(self.data.train_ids, dtype=np.uint64) flag = np.zeros_like(tid_arr, dtype=np.int32) for fa in self.data.files: mask_valid = np.isin(tid_arr, fa.valid_train_ids) flag[mask_valid] = 1 return flag def set_writer(self): """Record the package & version writing the file in an attribute""" from . import __version__ self.file.attrs['writer'] = 'extra_data {}'.format(__version__) def write(self): d = self.data self.set_writer() self.write_train_ids() for source in d.all_sources: self.prepare_source(source) self.write_metadata() for source in d.all_sources: self.copy_source(source) self.write_indexes() class VirtualFileWriter(FileWriter): """Write virtual datasets in European XFEL format The new files refer to the original data files, so they aren't portable, but they provide more convenient access by reassembling data spread over several sequence files. """ def __init__(self, path, data): if not hasattr(h5py, 'VirtualLayout'): raise Exception("Creating virtual datasets requires HDF5 1.10 " "and h5py 2.9") super().__init__(path, data) def _assemble_data(self, keydata): """Assemble chunks of data into a virtual layout""" # Create the layout, which will describe what data is where layout = h5py.VirtualLayout(shape=keydata.shape, dtype=keydata.dtype) # Map each chunk into the relevant part of the layout output_cursor = 0 for chunk in keydata._data_chunks_nonempty: n = chunk.total_count src = h5py.VirtualSource(chunk.dataset) layout[output_cursor : output_cursor + n] = src[chunk.slice] output_cursor += n assert output_cursor == layout.shape[0] return layout # In big detector data, these fields are like extra indexes. # So we'll copy them to the output file for fast access, rather than # making virtual datasets. copy_keys = {'image.pulseId', 'image.cellId'} def prepare_source(self, source): srcdata = self.data[source] grp_out = self.file.require_group(f'{srcdata.section}/{source}') grp_out.attrs['source_files'] = sorted([f.filename for f in srcdata.files]) for key in srcdata.keys(): if key in self.copy_keys: self.copy_dataset(source, key) else: self.add_dataset(source, key) # Add a link in RUN for control sources if srcdata.is_control: src_file = srcdata.files[0] run_path = f'RUN/{source}' self.file[run_path] = h5py.ExternalLink(src_file.filename, run_path) def copy_dataset(self, source, key): """Copy data as a new dataset""" a = self.data.get_array(source, key) path = f"{self._section(source)}/{source}/{key.replace('.', '/')}" self.file.create_dataset(path, data=a.values, compression='gzip') self._make_index(source, key, a.coords['trainId'].values) def add_dataset(self, source, key): keydata = self.data[source, key] if keydata.shape[0] == 0: # No data # Make the dataset virtual even with no source data to map. # This workaround will hopefully become unnecessary from h5py 3.14 parent_path, name = keydata.hdf5_data_path.rsplit('/', 1) group = self.file.require_group(parent_path) dcpl = h5py.h5p.create(h5py.h5p.DATASET_CREATE) dcpl.set_layout(h5py.h5d.VIRTUAL) h5py.h5d.create( group.id, name=name.encode(), tid=h5py.h5t.py_create(keydata.dtype, logical=1), space=h5py.h5s.create_simple(keydata.shape), dcpl=dcpl ) else: layout = self._assemble_data(keydata) self.file.create_virtual_dataset(keydata.hdf5_data_path, layout) self._make_index(source, key, keydata.train_id_coordinates()) if source in self.data.instrument_sources: self.data_sources.add(f"INSTRUMENT/{source}/{key.partition('.')[0]}") else: self.data_sources.add(f"CONTROL/{source}") return keydata.hdf5_data_path def copy_source(self, source): pass # Override base class copying data ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/pytest.ini0000644000175100001660000000040414757376472015366 0ustar00runnerdocker[pytest] addopts = --ignore docs/xpd_examples.ipynb --ignore docs/xpd_examples2.ipynb --ignore docs/parallel_example.ipynb --ignore docs/dask_averaging.ipynb --ignore docs/inspection.ipynb --ignore docs/iterate_trains.ipynb --ignore docs/aligning_trains.ipynb ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1740504381.8817263 extra_data-1.20.0/setup.cfg0000644000175100001660000000004614757376476015164 0ustar00runnerdocker[egg_info] tag_build = tag_date = 0 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1740504378.0 extra_data-1.20.0/setup.py0000755000175100001660000000637314757376472015065 0ustar00runnerdocker#!/usr/bin/env python import os.path as osp import re from setuptools import setup, find_packages import sys def get_script_path(): return osp.dirname(osp.realpath(sys.argv[0])) def read(*parts): return open(osp.join(get_script_path(), *parts)).read() def find_version(*parts): vers_file = read(*parts) match = re.search(r'^__version__ = "(\d+\.\d+\.\d+)"', vers_file, re.M) if match is not None: return match.group(1) raise RuntimeError("Unable to find version string.") setup(name="EXtra-data", version=find_version("extra_data", "__init__.py"), author="European XFEL GmbH", author_email="da-support@xfel.eu", maintainer="Thomas Michelat", project_urls={ 'Documentation': 'https://extra-data.readthedocs.io/en/latest/', 'Release notes': 'https://extra-data.readthedocs.io/en/latest/changelog.html', 'Issues': 'https://github.com/European-XFEL/EXtra-data/issues', 'Source': 'https://github.com/European-XFEL/EXtra-data', }, description="Tools to read and analyse data from European XFEL ", long_description=read("README.md"), long_description_content_type='text/markdown', license="BSD-3-Clause", packages=find_packages(), package_data={ 'extra_data.tests': ['dssc_geo_june19.h5', 'lpd_mar_18.h5'], }, entry_points={ "console_scripts": [ "lsxfel = extra_data.lsxfel:main", "karabo-bridge-serve-files = extra_data.cli.serve_files:main", "karabo-bridge-serve-run = extra_data.cli.serve_run:main", "extra-data-validate = extra_data.validation:main", "extra-data-make-virtual-cxi = extra_data.cli.make_virtual_cxi:main", "extra-data-locality = extra_data.locality:main", ], }, install_requires=[ 'h5py>=2.10', 'matplotlib', 'numpy', 'packaging', 'pandas', 'xarray', 'pyyaml', ], extras_require={ 'bridge': [ 'karabo-bridge >=0.6', 'psutil', ], 'complete': [ 'dask[array]', 'extra_data[bridge]', 'tomli; python_version < "3.11"', ], 'docs': [ 'extra_data[bridge]', # For autodoc of ZMQStreamer 'ipython', # For nbsphinx syntax highlighting 'nbsphinx', 'sphinx', 'sphinxcontrib_github_alt', ], 'test': [ 'cloudpickle', 'coverage', 'extra_data[complete]', 'nbval', 'pytest', 'pytest-cov', 'testpath', ] }, python_requires='>=3.10', classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', 'Intended Audience :: Developers', 'Intended Audience :: Science/Research', 'License :: OSI Approved :: BSD License', 'Operating System :: POSIX :: Linux', 'Programming Language :: Python :: 3', 'Topic :: Scientific/Engineering :: Information Analysis', 'Topic :: Scientific/Engineering :: Physics', ] )