Source code for fastr.datatypes

# Copyright 2011-2014 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
The datatypes module holds all DataTypes generated by fastr and all the base
classes for these datatypes.
"""

# Empty module to be populated by the fastr.core.datatypemanager.DataTypeManager
from abc import abstractmethod, abstractproperty
import os
import sys
import traceback
import urllib
import urlparse

from fastr import exceptions as exceptions
import fastr
from fastr.core.baseplugin import BasePlugin, PluginState
from fastr.core.version import Version
from fastr.data import url
from fastr.utils.checksum import md5_checksum, hashsum
from fastr.utils.classproperty import classproperty
from fastr.utils import iohelpers


class BaseDataType(BasePlugin):
    """
    The base class for all datatypes in the fastr type system.
    """
    filename = __file__

    #: Version of the DataType definition
    version = Version('1.0')

    #: Description of the DataType
    description = ''

    #: Extension related to the Type
    extension = None

    # DataTypes do not need to be loaded, so they are always Loaded
    _status = (PluginState.loaded, 'DataTypes are always loaded', '')

    @abstractmethod
    def __init__(self, value=None, format_=None):
        """
        The BaseDataType constructor.

        :param value: value to assign to the new BaseDataType object
        :param format_: the format used for the ValueType
        :return: new BaseDataType object
        :raises FastrDataTypeNotInstantiableError: if not subclassed
        :raises FastrNotImplementedError: if *id*, *name*, *version* or
                                          *description* is None
        """
        super(BaseDataType, self).__init__()

        if self.__class__ is BaseDataType:
            raise exceptions.FastrDataTypeNotInstantiableError('BaseDataType not instantiable')

        # Check if id, name and version are set properly
        if self.id is None or self.name is None or self.version is None or self.description is None:
            raise exceptions.FastrNotImplementedError(('Subclasses of BaseDataType need to'
                                                       ' have id ({}), name ({}), version ({})'
                                                       ' and description ({}) fields set!').format(self.id,
                                                                                                   self.name,
                                                                                                   self.version,
                                                                                                   self.description))

        self._value = None
        # The value setter may copy the format of another instance, the
        # explicit format_ argument is assigned afterwards and takes precedence
        self.value = value
        self.format = format_

        if not self.valid:
            fastr.log.debug("'{}' is not a valid {}".format(self._value, self.id))

    def __repr__(self):
        """
        Returns string representation of the BaseDataType

        :return: string representation
        :rtype: str
        """
        return "<{}: {}>".format(self.id, repr(self.value))

    def __str__(self):
        """
        Returns the string version of the BaseDataType

        :return: string version
        :rtype: str
        """
        return str(self.value)

    def __eq__(self, other):
        """
        Test the equality of two DataType objects

        :parameter DataType other: the object to compare against
        :return: flag indicating equality, or NotImplemented if the other
                 object is not of exactly the same type
        :rtype: bool
        """
        if type(self) is not type(other):
            return NotImplemented

        return self.value == other.value

    def __ne__(self, other):
        """
        Test if two objects are not equal. This is by default done by negating
        the __eq__ operator

        :parameter DataType other: the object to compare against
        :return: flag indicating inequality, or NotImplemented if __eq__
                 cannot compare the two objects
        :rtype: bool
        """
        result = self.__eq__(other)

        # NotImplemented is truthy, negating it would report objects of
        # different types as "not unequal"; pass it on so Python can fall back
        if result is NotImplemented:
            return result

        return not result

    def __getstate__(self):
        """
        Get the state of the object for pickling

        :return: tuple containing the type id, the value and the format
        :rtype: tuple
        """
        return (type(self).id, self.value, self.format)

    def __setstate__(self, state):
        """
        Restore the object from a pickled state

        :param tuple state: the state as created by __getstate__
        :raises FastrValueError: if the state was created for another type id
        """
        if self.id != state[0]:
            raise exceptions.FastrValueError('Unvalid state for {}, state is for type {}'.format(self.id, state[0]))

        self.value = state[1]
        self.format = state[2]

    @classproperty
    def id(cls):
        """
        Internal id used for the DataType
        """
        return cls.__name__

    @classproperty
    def fullid(cls):
        """
        The full fastr id of the DataType
        """
        return '{}/{}'.format(cls.parent.fullid, cls.id)

    @classproperty
    def name(cls):
        """
        Display friendly name of the DataType
        """
        return cls.id

    @property
    def value(self):
        """
        The value of object instantiation of this DataType.
        """
        return self._value

    @property
    def raw_value(self):
        """
        The raw value of object instantiation of this DataType. For datatypes
        that override value (like Deferred) this is the way to access the _value
        field.
        """
        result = self._value

        if isinstance(result, basestring):
            if result.startswith('val://'):
                result = result[6:]

            # NOTE(review): the nesting of this statement could not be
            # recovered from the flattened source; it is assumed to apply to
            # every string value, confirm against the original file
            result = urllib.quote_plus(result)

        return result

    @property
    def data_uri(self):
        """
        The value itself if it is a url, otherwise a fastr://data/constant/
        uri wrapping the value.
        """
        if url.isurl(self._value):
            return self._value
        else:
            return 'fastr://data/constant/{}'.format(self._value)

    @value.setter
    def value(self, value):
        """
        Setter function for value property
        """
        if isinstance(value, type(self)):
            # Copy the content of an object of the same type
            self._value = value.value
            self.format = value.format
        else:
            self._value = value

    @classproperty
    def parent(self):
        """
        The parent container of the DataType
        """
        return fastr.typelist

    @property
    def parsed_value(self):
        """
        The parsed value of object instantiation of this DataType.
        """
        return self._value

    @classmethod
    def test(cls):
        """
        Define the test for the BasePluginManager. Make sure we are not one of
        the base classes

        :raises FastrTypeError: if cls is one of the basic datatype classes
        """
        if cls in [BaseDataType, DataType, TypeGroup, EnumType, ValueType, URLType]:
            raise exceptions.FastrTypeError('Cannot use a basic datatype {} as a plugin!'.format(cls))

    @property
    def valid(self):
        """
        A boolean flag that indicates whether or not the value assigned to this
        DataType is valid. This property is generally overwritten by
        implementation of specific DataTypes.

        Any exception raised by the validation is logged as a warning and then
        re-raised.
        """
        # The validation is external code, so a broad except is used on
        # purpose to be able to log whatever went wrong
        # pylint: disable=broad-except
        try:
            return self._validate()
        except Exception:
            exc_type, _, _ = sys.exc_info()
            exc_info = traceback.format_exc()
            fastr.log.warning('Could not validate {}: encountered exception ({}) during execution:\n{}'.format(repr(self), exc_type.__name__, exc_info))
            raise

    def _validate(self):
        """
        The actual validation function to be overwritten by subclasses.
        """
        # This function doesn't use self, but is intended to be potentially
        # overwritten by the subclasses.
        # pylint: disable=no-self-use
        return True

    def checksum(self):
        """
        Generate a checksum for the value of this DataType

        :return: the checksum of the value
        :rtype: str
        """
        return hashsum(self.value)

    @classmethod
    def isinstance(cls, value):
        """
        Indicate whether value is an instance for this DataType.

        :return: the flag indicating the value is of this DataType
        :rtype: bool
        """
        return isinstance(value, cls)
class DataType(BaseDataType):
    """
    Base class for every DataType that can hold a value of its own.
    """

    @abstractmethod
    def __init__(self, value=None, format_=None):
        """
        Create a new DataType object.

        :param value: value to assign to the new DataType object
        :param format_: the format used for the ValueType
        :return: new DataType object
        """
        super(DataType, self).__init__(value, format_)

    def action(self, name):
        """
        Hook that subclasses can overwrite to perform a named action. For
        example, the *Directory* DataType has an action *ensure* which makes
        sure the Directory exists. A Tool can indicate an action for an Output,
        which will be called before execution.

        This default implementation knows no actions: any name other than
        None is reported with a warning.

        :param str name: name of the action to execute
        :return: None
        """
        if name is None:
            return

        message = "unknown action '{}' for DataType {}".format(name, self.id)
        fastr.log.warning(message)
class TypeGroup(BaseDataType):
    """
    The TypeGroup is a special DataType that does not hold a value of its own
    but is used to group a number of DataTypes. For example ITK has a list of
    supported file formats that all tools build on ITK support. A group can be
    used to conveniently specify this in multiple Tools that use the same set
    DataTypes.
    """

    def __new__(cls, value=None, format_=None):
        """
        Instantiate a TypeGroup. This will match the value to the best
        matching type and instantiate that. Note that the returned object will
        not be of type TypeGroup but one of the TypeGroup members.

        :param value: value to create the new object for
        :param format_: the format passed on to the matching type
        :return: the value itself if it already is an instance of a member,
                 otherwise a new object of the matching member type
        :raises FastrValueError: if no matching type could be found
        """
        # Avoid casting values that are already in a member type
        if any(isinstance(value, x) for x in cls.members):
            return value

        # First ask the typelist for a match, restricted to this group
        matching_type = fastr.typelist.guess_type(value, options=cls)

        # Only continue if we have value urls
        # NOTE(review): the nesting of the two fallbacks below could not be
        # recovered from the flattened source; both are assumed to apply to
        # val:// urls only, confirm against the original file
        if isinstance(value, (str, unicode)) and value.startswith('val://'):
            if matching_type is None:
                # Just try a simple match rather than guessing
                matching_type = fastr.typelist.match_types(cls, type(value))

            if matching_type is None:
                # Still nothing, try a match using only the group itself
                matching_type = fastr.typelist.match_types(cls)

        if matching_type is None:
            raise exceptions.FastrValueError('Cannot matching value {} [{}] to any of {}'.format(value, type(value).__name__, cls.members))

        return matching_type(value, format_)

    def __init__(self, value=None):
        """
        Dummy constructor. TypeGroups are not instantiable and cannot hold a
        value of its own.

        :raises FastrDataTypeNotInstantiableError: if called
        """
        # All type groups are per definition not instantiable
        # pylint: disable=super-init-not-called
        raise exceptions.FastrDataTypeNotInstantiableError('TypeGroups are not instantiable')

    @abstractproperty
    def _members(self):
        """
        The id of the members, this should be set in the subclass. It should be
        a frozen set of str.
        """
        # NOTE(review): this only binds a local name, nothing is returned
        _members = frozenset()

    @classproperty
    def members(cls):
        """
        The members of the TypeGroup, as a frozenset of the datatypes found in
        fastr.typelist. Ids in _members that are not in fastr.typelist are
        skipped and members that are a TypeGroup themselves are replaced by
        their own members.
        """
        members = frozenset(fastr.typelist[x] for x in cls._members if x in fastr.typelist)
        # Expand nested TypeGroups, wrap other types so everything is iterable
        members = [x.members if issubclass(x, TypeGroup) else (x,) for x in members]
        # Flatten the result into a single set of types
        return frozenset(x for y in members for x in y)

    @classmethod
    def isinstance(cls, value):
        """
        Indicate whether value is an instance of any member of this TypeGroup.

        :return: the flag indicating the value is accepted by a member type
        :rtype: bool
        """
        return any(x.isinstance(value) for x in cls.members)
class AnyType(TypeGroup):
    """
    Special TypeGroup in fastr whose members are all known DataTypes.
    """

    @classproperty
    def _members(cls):
        """
        A "class-property" giving the ids of all types in fastr.typelist that
        are a subclass of DataType.
        """
        member_ids = set()

        for datatype in fastr.typelist.values():
            if issubclass(datatype, DataType):
                member_ids.add(datatype.id)

        return frozenset(member_ids)

    @classproperty
    def description(cls):
        """
        The description of the AnyType, including the list of member types.
        """
        # NOTE(review): the line breaks inside this template were lost in the
        # flattened source and are reconstructed, confirm against the original
        template = """
TypeGroup {id}
{name} ({id}) is a group of consisting of all DataTypes known by fastr, currently:

{members}
        """

        member_lines = '\n'.join(' - {}'.format(member) for member in cls.members)
        return template.strip().format(id='AnyType', name='AnyType', members=member_lines)
class AnyFile(TypeGroup):
    """
    Special TypeGroup in fastr whose members are all known URLTypes.
    """

    @classproperty
    def _members(cls):
        """
        A "class-property" giving the ids of all types in fastr.typelist that
        are a subclass of URLType.
        """
        member_ids = set()

        for datatype in fastr.typelist.values():
            if issubclass(datatype, URLType):
                member_ids.add(datatype.id)

        return frozenset(member_ids)

    @classproperty
    def description(cls):
        """
        The description of the AnyFile, including the list of member types.
        """
        # NOTE(review): the line breaks inside this template were lost in the
        # flattened source and are reconstructed, confirm against the original
        template = """
TypeGroup {id}
{name} ({id}) is a group of consisting of all URLTypes known by fastr, currently:

{members}
        """

        member_lines = '\n'.join(' - {}'.format(member) for member in cls.members)
        return template.strip().format(id='AnyFile', name='AnyFile', members=member_lines)
class EnumType(DataType):
    """
    Base for DataTypes whose value must be one option out of a predefined set
    of possibilities (similar to an enum type in many programming languages).
    """
    #: Enums always have version 1.0
    version = Version('1.0')

    # The allowed values, to be overwritten by the subclasses
    _options = frozenset()

    def __init__(self, value=None, format_=None):
        """
        Create a new EnumType object.

        :param value: value to assign to the new EnumType object
        :param format_: the format used for the ValueType
        :return: new EnumType object
        :raises FastrDataTypeNotInstantiableError: if not subclassed
        """
        super(EnumType, self).__init__(value, format_)

        # The check is performed after the parent constructor has run
        is_base_class = self.__class__ is EnumType
        if is_base_class:
            raise exceptions.FastrDataTypeNotInstantiableError('EnumType is not instantiable')

    @classproperty
    def description(cls):
        """
        The description of the EnumType, including the list of options.
        """
        # NOTE(review): the line breaks inside this template were lost in the
        # flattened source and are reconstructed, confirm against the original
        template = """
{name} ({id}) is a enumerate type with options:

{options}

{name} can take the value of any of the option, but any other value is considered invalid.
        """

        option_lines = '\n'.join(' - "{}"'.format(option) for option in cls.options)
        return template.strip().format(id=cls.id, name=cls.name, options=option_lines)

    @classproperty
    def options(cls):
        """
        A frozenset holding the options that the value of the EnumType object
        can have.

        :return: the options the value can hold
        :rtype: frozenset
        """
        return cls._options

    def _validate(self):
        """
        The value is valid if it is one of the options.
        """
        return self._value in self._options
class ValueType(DataType):
    """
    Base for DataTypes that hold a simple value, i.e. something that is
    neither an EnumType nor a file/URL. The value is generally represented by
    a string.
    """

    def __init__(self, value=None, format_=None):
        """
        Create a new ValueType object.

        :param value: value to assign to the new ValueType
        :param format_: the format used for the ValueType
        :return: new ValueType object
        """
        # Nothing specific to set up, everything is handled by the DataType
        super(ValueType, self).__init__(value, format_)
class URLType(DataType):
    """
    The URLType is the base for DataTypes that point to a resource somewhere
    else (typically a filesystem). The true value is actually the resource
    referenced by the value in this object.
    """

    def __init__(self, value=None, format_=None):
        """
        The URLType constructor

        :param value: value to assign to the new URLType
        :param format_: the format used for the ValueType
        :return: new URLType object
        """
        super(URLType, self).__init__(value, format_)

    def __eq__(self, other):
        """
        Test the equality of two DataType objects. URLTypes are compared using
        the checksum of the data they point to, not using the url itself.

        :parameter URLType other: the object to compare against
        :return: flag indicating equality, or NotImplemented if the other
                 object is not of exactly the same type
        :rtype: bool
        """
        if type(self) is not type(other):
            return NotImplemented

        return self.checksum() == other.checksum()

    def checksum(self):
        """
        Return the checksum of this URL type

        :return: checksum string
        :rtype: str
        """
        contents = self.content(self.parsed_value)
        return md5_checksum(contents)

    @property
    def parsed_value(self):
        """
        The parsed value of object instantiation of this DataType.

        A value that is not a url is returned as is, a ``vfs://`` url is
        translated to a path and a ``val://`` url is resolved via the job file
        it points to.

        :raises FastrKeyError: if a val:// url points to output data that
                               cannot be found in the job file
        :raises FastrValueError: if the url is neither a vfs nor a val url
        """
        if url.isurl(self.value):
            parsed_url = urlparse.urlparse(self.value)

            if parsed_url.scheme == 'val':
                # The netloc selects the mount, the url path is relative to it
                datafile = os.path.join(fastr.config.mounts[parsed_url.netloc],
                                        os.path.normpath(parsed_url.path[1:]))
                query = urlparse.parse_qs(parsed_url.query)

                # Open Job file
                data = iohelpers.load_gpickle(datafile)

                # Attempt to extract data
                try:
                    outputname = query['outputname'][0]
                    cardinality_nr = int(query['nr'][0])

                    if 'sampleid' in query:
                        sample_id = query['sampleid'][0]
                        value = data.output_data[outputname][sample_id][cardinality_nr]
                    else:
                        value = data.output_data[outputname][cardinality_nr]
                except (IndexError, KeyError) as exception:
                    fastr.log.debug('Output data for query: {}'.format(data.output_data))
                    message = 'Could not get value from {}, encountered {}: {}'.format(self.value,
                                                                                       type(exception).__name__,
                                                                                       exception.message)
                    raise exceptions.FastrKeyError(message)

                if value.startswith('vfs://'):
                    return fastr.vfs.url_to_path(value)
                else:
                    return value
            elif parsed_url.scheme == 'vfs':
                return fastr.vfs.url_to_path(self.value)
            else:
                raise exceptions.FastrValueError('Cannot get parsed value for (non-vfs, non-val) url: {} (scheme {})'.format(self.value, parsed_url.scheme))
        else:
            return self.value

    @classmethod
    def content(cls, inval, outval=None):
        """
        Give the contents of a URLType, this is generally useful for filetypes
        that consists of multiple files (e.g. AnalyzeImageFile, DICOM). The
        value will indicate the main file, and the contents function can
        determine all files that form a single data value.

        :param inval: a value to figure out contents for this type
        :param outval: the place where the copy should point to
        :return: a list of all files part of the value (e.g. header and data
                 file); if outval is given the list contains (inval, outval)
                 tuples instead
        :rtype: list
        """
        if outval is not None:
            return [(inval, outval)]
        else:
            return [inval]

    @property
    def valid(self):
        """
        A boolean flag that indicates whether or not the value assigned to this
        DataType is valid. This property is generally overwritten by
        implementation of specific DataTypes.
        """
        # Accept both str and unicode, just as the other string checks in
        # this module do; anything else cannot be a url or path
        if not isinstance(self.value, basestring):
            return False

        return super(URLType, self).valid

    def _validate(self):
        """
        The actual validation function to be overwritten by subclasses. By
        default a URLType is valid if all parts of its content exist.
        """
        try:
            contents = self.content(self.parsed_value)

            for content in contents:
                if not os.path.exists(content):
                    return False

            return True
        except (TypeError, ValueError, IOError):
            return False
class Deferred(DataType):
    """
    DataType holding a ``val://`` url that points to a value in the output
    data of a job file. The target is looked up on first use and cached.
    """

    def __init__(self, value=None, format_=None):
        """
        The Deferred constructor.

        :param value: value to assign to the new DataType object
        :param format_: This is ignored but here for compatibility
        :return: new Deferred object
        """
        # The parent constructor is not called here, only the url and an
        # empty cache are stored
        self._value = value
        self._data = None

    def __repr__(self):
        """
        Returns string representation of the Deferred. The target is shown if
        it is available, otherwise the url the deferred points to.

        :return: string representation
        :rtype: str
        """
        # A representation should not fail because the target is not available
        # (yet), treat that the same way as value, _validate and checksum do
        try:
            target = self.target
        except exceptions.FastrKeyError:
            target = None

        if target is None:
            return "<{}: {}>".format(self.id, repr(self._value))
        else:
            return "<{}: {}>".format(self.id, repr(target))

    def __getstate__(self):
        """
        Get the state of the object for pickling, the cached data is left out

        :return: tuple containing the type id and the url
        :rtype: tuple
        """
        return ('Deferred', self._value)

    def __setstate__(self, state):
        """
        Restore the object from a pickled state

        :param tuple state: the state as created by __getstate__
        :raises FastrValueError: if the state was created for another type id
        """
        if self.id != state[0]:
            raise exceptions.FastrValueError('Unvalid state for {}, state is for type {}'.format(self.id, state[0]))

        self._value = state[1]
        # Unpickling does not call __init__, so the cache has to be created
        # here or _get_data fails on the missing attribute
        self._data = None

    @classmethod
    def lookup(cls, value):
        """
        Look up the deferred target and return that object

        :param value: the val:// url to look up
        :return: tuple containing the value the deferred points to, the
                 provenance and the loaded job file data
        :rtype: tuple
        :raises FastrKeyError: if the deferred is not available (yet)
        :raises FastrValueError: if the value is not a valid deferred url
        """
        parsed_url = urlparse.urlparse(value)

        if parsed_url.scheme == 'val':
            # The netloc selects the mount, the url path is relative to it
            datafile = os.path.join(fastr.config.mounts[parsed_url.netloc],
                                    os.path.normpath(parsed_url.path[1:]))
            query = urlparse.parse_qs(parsed_url.query)

            # Open Job file
            data = iohelpers.load_gpickle(datafile)

            # Get provenance
            provenance = data.provenance

            # Attempt to extract data
            try:
                outputname = query['outputname'][0]
                cardinality_nr = int(query['nr'][0])

                if 'sampleid' in query:
                    sample_id = query['sampleid'][0]
                    value = data.output_data[outputname][sample_id][cardinality_nr]
                else:
                    value = data.output_data[outputname][cardinality_nr]
            except (IndexError, KeyError) as exception:
                fastr.log.debug('Output data for query: {}'.format(data.output_data))
                message = 'Could not get value from {}, encountered {}: {}'.format(value,
                                                                                   type(exception).__name__,
                                                                                   exception.message)
                raise exceptions.FastrKeyError(message)
        else:
            raise exceptions.FastrValueError('Cannot lookup value {}, wrong url scheme'.format(value))

        # A deferred can point to another deferred, follow the chain
        if isinstance(value, Deferred):
            value, provenance, data = value._get_data()

        return value, provenance, data

    def _get_data(self):
        """
        Get and cache the data of the target.

        :return: tuple containing (target, provenance, job file data)
        :rtype: tuple
        """
        if self._data is None:
            self._data = self.lookup(self._value)

        return self._data

    @property
    def target(self):
        """
        Target object for this deferred.

        :raises FastrKeyError: if the deferred is not available (yet)
        :raises FastrValueError: if the value is not a valid deferred url
        """
        return self._get_data()[0]

    @property
    def value(self):
        """
        The value of object instantiation of this DataType. This is the value
        of the target, or None if the target is not available (yet).
        """
        try:
            target = self.target
        except exceptions.FastrKeyError:
            return None

        return target.value

    @property
    def provenance(self):
        """
        The provenance as found by the lookup of the target.
        """
        return self._get_data()[1]

    @property
    def job(self):
        """
        The data loaded from the job file during the lookup, or None if the
        value is not a valid deferred url.
        """
        try:
            return self._get_data()[2]
        except exceptions.FastrValueError:
            return None

    @property
    def data_uri(self):
        """
        The fastr://data/job/ uri for a val:// url, for other values the
        data_uri as defined by the parent class.
        """
        if self._value.startswith('val://'):
            val_uri = urlparse.urlparse(self._value)
            val_query = urlparse.parse_qs(val_uri.query)

            # Use the sample id of the job if the url does not specify one
            sample_id = val_query['sampleid'][0] if 'sampleid' in val_query else self.job.sample_id

            return 'fastr://data/job/{jobid}/output/{outputid}/sample/{sample}/cardinality/{cardinality}'.format(
                jobid=self.job.jobid,
                outputid=val_query['outputname'][0],
                sample=sample_id,
                cardinality=val_query['nr'][0]
            )
        else:
            return super(Deferred, self).data_uri

    def _validate(self):
        """
        The actual validation function. A Deferred is valid if its target is
        available and valid.
        """
        try:
            target = self.target
        except exceptions.FastrKeyError:
            return False

        return target.valid

    def checksum(self):
        """
        Generate a checksum for the value of this DataType. If the target is
        not available (yet) the checksum of a fixed placeholder is returned.

        :return: the checksum of the value
        :rtype: str
        """
        try:
            target = self.target
        except exceptions.FastrKeyError:
            return hashsum('__FASTR_NOT_AVAILABLE_HASH__')

        return target.checksum()
def fastr_isinstance(obj, datatype):
    """
    Check if an object is of a specific datatype.

    For a TypeGroup the type of the object has to be one of the members of
    the group, for any other datatype a regular isinstance check is used.

    :param obj: Object to inspect
    :param datatype: The datatype(s) to check
    :type datatype: tuple, BaseDataType
    :return: flag indicating object is of datatype
    :rtype: bool
    """
    if not isinstance(datatype, tuple):
        datatype = (datatype,)

    for dtype in datatype:
        if issubclass(dtype, TypeGroup):
            if type(obj) in dtype.members:
                return True
        # Test against the current candidate only, not against the whole tuple
        elif isinstance(obj, dtype):
            return True

    return False