Source code for altamisa.isatab.validate_investigation

# -*- coding: utf-8 -*-
"""Validation of an ISA investigation

Eventually, all format independent content- and specification-related validations which
don't interrupt model creation definitely (e.g. when parsing from ISA-tab) should go
here. Then, validations can be performed on whole models (e.g. after parsing or before
writing) and provide a comprehensive list of warnings of different degree.
"""

import re
from typing import Dict, Tuple
import warnings

from ..exceptions import (
    AdvisoryIsaValidationWarning,
    CriticalIsaValidationWarning,
    ModerateIsaValidationWarning,
)
from .helpers import is_ontology_term_ref
from . import models
from .validate_assay_study import _OntologyTermRefValidator


__author__ = "Mathias Kuhring <mathias.kuhring@bihealth.de>"


# Pattern and helper functions for validation ------------------------------------------------------


# DATE_PATTERN = re.compile("^\\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12]\\d|3[01])$")
MAIL_PATTERN = re.compile("^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$")
PHONE_PATTERN = re.compile("^\\+?[\\d /()-]+$")  # only checks characters!
DOI_PATTERN = re.compile("^(?:(?:DOI|doi):)?10[.][0-9]{4,}(?:[.][0-9]+)*/\\S+$")
PMID_PATTERN = re.compile("^\\d+$")


def _validate_mail_address(mail_address) -> str:
    """Helper function to validate mail strings"""
    if mail_address and not MAIL_PATTERN.match(mail_address):
        tpl = "Invalid mail address: {}"
        msg = tpl.format(mail_address)
        warnings.warn(msg, AdvisoryIsaValidationWarning)


def _validate_phone_number(phone_number) -> str:
    """Helper function to validate phone/fax number strings"""
    if phone_number and not PHONE_PATTERN.match(phone_number):
        tpl = "Invalid phone/fax number: {}"
        msg = tpl.format(phone_number)
        warnings.warn(msg, AdvisoryIsaValidationWarning)


def _validate_doi(doi) -> str:
    """Helper function to validate doi strings"""
    if doi and not DOI_PATTERN.match(doi):
        tpl = "Invalid doi string: {}"
        msg = tpl.format(doi)
        warnings.warn(msg, AdvisoryIsaValidationWarning)


def _validate_pubmed_id(pubmed_id) -> str:
    """Helper function to validate pubmed id strings"""
    if pubmed_id and not PMID_PATTERN.match(pubmed_id):
        tpl = "Invalid pubmed_id string: {}"
        msg = tpl.format(pubmed_id)
        warnings.warn(msg, AdvisoryIsaValidationWarning)


# Validator classes --------------------------------------------------------------------



[docs]
class InvestigationValidator:
    """
    Validator for Investigation

    :type investigation: models.InvestigationInfo
    :param investigation: The investigation model to validate
    """

    def __init__(self, investigation: models.InvestigationInfo):
        self._investigation = investigation
        self._ontology_validator = _OntologyTermRefValidator(investigation.ontology_source_refs)
        self._study_ids = set()
        self._study_paths = set()
        self._study_titles = set()
        self._assay_paths = set()


[docs]
    def validate(self):
        """Validate the investigation"""
        self._validate_ontology_sources()
        self._validate_sections()


    def _validate_ontology_sources(self):
        for source in self._investigation.ontology_source_refs.values():
            # Check that ontology sources are complete
            if not all((source.name, source.file, source.version, source.description)):
                tpl = "Incomplete ontology source; found: {}, {}, {}, {}, {}"
                msg = tpl.format(
                    source.name, source.file, source.version, source.description, source.comments
                )
                warnings.warn(msg, CriticalIsaValidationWarning)
            # Check that ontology source names contain no whitespaces
            if re.search("\\s", source.name):
                tpl = "Ontology source name including whitespace(s); found: {}, {}, {}, {}, {}"
                msg = tpl.format(
                    source.name, source.file, source.version, source.description, source.comments
                )
                warnings.warn(msg, AdvisoryIsaValidationWarning)

    def _validate_sections(self):
        self._validate_investigation_info()
        self._validate_publications(self._investigation.publications)
        self._validate_contacts(self._investigation.contacts)
        self._validate_studies()

    def _validate_investigation_info(self):
        info = self._investigation.info
        # If only one study is available, metadata should be recorded in the study section
        # (https://isa-specs.readthedocs.io/en/latest/isatab.html#investigation-section)
        if len(self._investigation.studies) == 1:
            if any((info.title, info.description, info.submission_date, info.public_release_date)):
                tpl = (
                    "Investigation with only one study contains metadata:\n\tID:\t{}\n\tTitle:\t"
                    "{}\n\tPath:\t{}\n\tSubmission Date:\t{}\n\tPublic Release Date:\t{"
                    "}\n\tPrefer recording metadata in the study section."
                )
                msg = tpl.format(
                    info.identifier,
                    info.title,
                    info.path or "",
                    info.description,
                    info.submission_date,
                    info.public_release_date,
                )
                warnings.warn(msg, ModerateIsaValidationWarning)
        # If more than one study is available, investigation should at least contain an id and title
        else:
            # Validate availability of investigation identifier
            if not info.identifier:
                tpl = "Investigation without identifier:\nTitle:\t{}\nPath:\t{}"
                msg = tpl.format(info.title, info.path or "")
                warnings.warn(msg, ModerateIsaValidationWarning)
            # Validate availability of investigation title
            if not info.title:
                tpl = "Investigation without title:\nID:\t{}\nPath:\t{}"
                msg = tpl.format(info.identifier, info.path or "")
                warnings.warn(msg, ModerateIsaValidationWarning)

    def _validate_studies(self):
        # Check if any study exists
        if not self._investigation.studies:
            tpl = "No studies declared in investigation: {}"
            msg = tpl.format(self._investigation.info.path)
            warnings.warn(msg, CriticalIsaValidationWarning)
            return
        for study in self._investigation.studies:
            # Validate availability of minimal study information (ids, paths, titles)
            if not (study.info.identifier and study.info.path):
                tpl = (
                    "Study with incomplete minimal information (ID and path):"
                    "\nID:\t{}\nTitle:\t{}\nPath:\t{}"
                )
                msg = tpl.format(study.info.identifier, study.info.title, study.info.path or "")
                warnings.warn(msg, CriticalIsaValidationWarning)
            if not study.info.title:
                tpl = "Study without title:\nID:\t{}\nTitle:\t{}\nPath:\t{}"
                msg = tpl.format(study.info.identifier, study.info.title, study.info.path or "")
                warnings.warn(msg, ModerateIsaValidationWarning)
            # Assure distinct studies, i.e. unique ids, paths and preferably titles
            if study.info.identifier in self._study_ids:
                tpl = "Study identifier used more than once: {}"
                msg = tpl.format(study.info.identifier)
                warnings.warn(msg, CriticalIsaValidationWarning)
            else:
                self._study_ids.add(study.info.identifier)
            if study.info.path:
                if study.info.path in self._study_paths:
                    tpl = "Study path used more than once: {}"
                    msg = tpl.format(study.info.path or "")
                    warnings.warn(msg, CriticalIsaValidationWarning)
                else:
                    self._study_paths.add(study.info.path)
            if study.info.title:
                if study.info.title in self._study_titles:
                    tpl = "Study title used more than once: {}"
                    msg = tpl.format(study.info.title)
                    warnings.warn(msg, ModerateIsaValidationWarning)
                else:
                    self._study_titles.add(study.info.title)
            # Validate study sections
            self._validate_publications(study.publications)
            self._validate_contacts(study.contacts)
            self._validate_designs(study.designs)
            self._validate_factors(study.factors)
            self._validate_assays(study.assays, study.info.identifier)
            self._validate_protocols(study.protocols)

    def _validate_publications(self, publications: Tuple[models.PublicationInfo]):
        # Validate format of specific fields in publications
        for publication in publications:
            _validate_pubmed_id(publication.pubmed_id)
            _validate_doi(publication.doi)
            if is_ontology_term_ref(publication.status):
                self._ontology_validator.validate(publication.status)

    def _validate_contacts(self, contacts: Tuple[models.ContactInfo]):
        # Validate format of specific fields in contacts
        for contact in contacts:
            _validate_mail_address(contact.email)
            _validate_phone_number(contact.phone)
            _validate_phone_number(contact.fax)
            if is_ontology_term_ref(contact.role):
                self._ontology_validator.validate(contact.role)

    def _validate_designs(self, designs: Tuple[models.DesignDescriptorsInfo]):
        # Validate format of specific fields in designs
        for design in designs:
            if is_ontology_term_ref(design.type):
                self._ontology_validator.validate(design.type)

    def _validate_factors(self, factors: Dict[str, models.FactorInfo]):
        # Validate format of specific fields in factors
        for factor in factors.values():
            if is_ontology_term_ref(factor.type):
                self._ontology_validator.validate(factor.type)

    def _validate_assays(self, assays: Tuple[models.AssayInfo], study_id: str):
        # Check if any assays exists (according to specs, having an assays is not mandatory)
        if not assays:
            tpl = "No assays declared in study '{}' of investigation '{}'"
            msg = tpl.format(study_id, self._investigation.info.path)
            warnings.warn(msg, AdvisoryIsaValidationWarning)
            return
        for assay in assays:
            # Validate availability of minimal assay information
            # (path, measurement type, technology type and technology platform)
            meas_type = (
                assay.measurement_type.name
                if is_ontology_term_ref(assay.measurement_type)
                else assay.measurement_type
            )
            tech_type = (
                assay.technology_type.name
                if is_ontology_term_ref(assay.technology_type)
                else assay.technology_type
            )
            if not (assay.path and meas_type and tech_type):
                tpl = (
                    "Assay with incomplete minimal information (path, measurement and "
                    "technology type):\nPath:\t{}\nMeasurement Type:\t{}\nTechnology Type:\t{"
                    "}\nTechnology Platform:\t{}"
                )
                msg = tpl.format(assay.path or "", meas_type, tech_type, assay.platform)
                warnings.warn(msg, CriticalIsaValidationWarning)
            if not assay.platform:
                tpl = (
                    "Assay without platform:\nPath:\t{}"
                    "\nMeasurement Type:\t{}\nTechnology Type:\t{}\nTechnology Platform:\t{}"
                )
                msg = tpl.format(assay.path or "", meas_type, tech_type, assay.platform)
                warnings.warn(msg, AdvisoryIsaValidationWarning)
            # Assure distinct assays, i.e. unique paths
            if assay.path:
                if assay.path in self._assay_paths:
                    tpl = "Assay path used more than once: {}"
                    msg = tpl.format(assay.path or "")
                    warnings.warn(msg, CriticalIsaValidationWarning)
                else:
                    self._assay_paths.add(assay.path)
            # Validate format of specific fields in assays
            if is_ontology_term_ref(assay.measurement_type):
                self._ontology_validator.validate(assay.measurement_type)
            if is_ontology_term_ref(assay.technology_type):
                self._ontology_validator.validate(assay.technology_type)

    def _validate_protocols(self, protocols: Dict[str, models.ProtocolInfo]):
        # Validate format of specific fields in protocols
        for protocol in protocols.values():
            if is_ontology_term_ref(protocol.type):
                self._ontology_validator.validate(protocol.type)
            for parameter in protocol.parameters.values():
                if is_ontology_term_ref(parameter):
                    self._ontology_validator.validate(parameter)
            for component in protocol.components.values():
                if is_ontology_term_ref(component.type):
                    self._ontology_validator.validate(component.type)