# Source code for altamisa.isatab.headers
# -*- coding: utf-8 -*-
"""This module contains code for the representation of headers from study and
assay files and parsing thereof.
"""
from __future__ import generator_stop
from typing import Iterator, List
from ..constants import table_headers
from ..exceptions import ParseIsatabException
__author__ = "Manuel Holtgrewe <manuel.holtgrewe@bihealth.de>"
[docs]
class ColumnHeader:
    """Generic description of one header found in a study or assay table.

    An instance records the header type, the index of the column it starts
    at and how many columns it covers.  The links to a related term source
    reference header and unit header are initialised to ``None``.
    """

    def __init__(self, column_type, col_no, span):
        #: The type of this header
        self.column_type = column_type
        #: The column number this header refers to
        self.col_no = col_no
        #: Number of columns this header spans
        self.span = span
        #: Link to the TermSourceRefHeader to use
        self.term_source_ref_header = None
        #: Link to the UnitHeader to use
        self.unit_header = None

    def __str__(self):
        # ``!r`` applies ``repr()`` to each field, e.g. strings appear quoted.
        return "ColumnHeader(column_type={!r}, col_no={!r}, span={!r})".format(
            self.column_type, self.col_no, self.span
        )

    def __repr__(self):
        # The debug representation is the same text as the string form.
        return self.__str__()

    def get_simple_string(self) -> List[str]:
        """Return the column type wrapped in a one-element list."""
        simple_names = [self.column_type]
        return simple_names
[docs]
class SimpleColumnHeader(ColumnHeader):
    """Base class for unlabeled headers that occupy exactly one column.

    Concrete subclasses only override the ``column_type`` class attribute;
    the constructor forwards it to ``ColumnHeader`` together with a span of 1.
    """

    #: The value to use for the ``type`` argument.
    column_type = None

    def __init__(self, col_no):
        super().__init__(column_type=self.column_type, col_no=col_no, span=1)
# Material header
[docs]
class ExtractHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.EXTRACT_NAME`` columns of an assay."""

    column_type = table_headers.EXTRACT_NAME
[docs]
class LabeledExtractHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.LABELED_EXTRACT_NAME`` columns of an assay."""

    column_type = table_headers.LABELED_EXTRACT_NAME
[docs]
class LibraryHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.LIBRARY_NAME`` columns of an assay."""

    column_type = table_headers.LIBRARY_NAME
[docs]
class SampleHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.SAMPLE_NAME`` columns of a study or assay."""

    column_type = table_headers.SAMPLE_NAME
[docs]
class SourceHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.SOURCE_NAME`` columns of a study."""

    column_type = table_headers.SOURCE_NAME
# Data header
[docs]
class ArrayDataFileHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.ARRAY_DATA_FILE`` columns of an assay."""

    column_type = table_headers.ARRAY_DATA_FILE
[docs]
class ArrayDataMatrixFileHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.ARRAY_DATA_MATRIX_FILE`` columns of an assay."""

    column_type = table_headers.ARRAY_DATA_MATRIX_FILE
[docs]
class ArrayDesignFileHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.ARRAY_DESIGN_FILE`` columns of an assay."""

    column_type = table_headers.ARRAY_DESIGN_FILE
[docs]
class DerivedArrayDataFileHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.DERIVED_ARRAY_DATA_FILE`` columns of an assay."""

    column_type = table_headers.DERIVED_ARRAY_DATA_FILE
[docs]
class DerivedArrayDataMatrixFileHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.DERIVED_ARRAY_DATA_MATRIX_FILE`` columns of an assay."""

    column_type = table_headers.DERIVED_ARRAY_DATA_MATRIX_FILE
[docs]
class DerivedDataFileHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.DERIVED_DATA_FILE`` columns of an assay."""

    column_type = table_headers.DERIVED_DATA_FILE
[docs]
class DerivedSpectralDataFileHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.DERIVED_SPECTRAL_DATA_FILE`` columns of an assay."""

    column_type = table_headers.DERIVED_SPECTRAL_DATA_FILE
[docs]
class ImageFileHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.IMAGE_FILE`` columns of an assay."""

    column_type = table_headers.IMAGE_FILE
[docs]
class MetaboliteAssignmentFileHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.METABOLITE_ASSIGNMENT_FILE`` columns of an assay."""

    column_type = table_headers.METABOLITE_ASSIGNMENT_FILE
[docs]
class PeptideAssignmentFileHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.PEPTIDE_ASSIGNMENT_FILE`` columns of an assay."""

    column_type = table_headers.PEPTIDE_ASSIGNMENT_FILE
[docs]
class PostTranslationalModificationAssignmentFileHeader(SimpleColumnHeader):
    """Single-column header for post-translational modification assignment file columns of an assay."""

    column_type = table_headers.POST_TRANSLATIONAL_MODIFICATION_ASSIGNMENT_FILE
[docs]
class ProteinAssignmentFileHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.PROTEIN_ASSIGNMENT_FILE`` columns of an assay."""

    column_type = table_headers.PROTEIN_ASSIGNMENT_FILE
[docs]
class RawDataFileHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.RAW_DATA_FILE`` columns of an assay."""

    column_type = table_headers.RAW_DATA_FILE
[docs]
class RawSpectralDataFileHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.RAW_SPECTRAL_DATA_FILE`` columns of an assay."""

    column_type = table_headers.RAW_SPECTRAL_DATA_FILE
[docs]
class SpotPickingFileHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.SPOT_PICKING_FILE`` columns of an assay."""

    column_type = table_headers.SPOT_PICKING_FILE
# Assay header
[docs]
class AssayNameHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.ASSAY_NAME`` columns of an assay."""

    column_type = table_headers.ASSAY_NAME
[docs]
class DataTransformationNameHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.DATA_TRANSFORMATION_NAME`` columns of an assay."""

    column_type = table_headers.DATA_TRANSFORMATION_NAME
[docs]
class GelElectrophoresisAssayNameHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.GEL_ELECTROPHORESIS_ASSAY_NAME`` columns of an assay."""

    column_type = table_headers.GEL_ELECTROPHORESIS_ASSAY_NAME
[docs]
class HybridizationAssayNameHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.HYBRIDIZATION_ASSAY_NAME`` columns of an assay."""

    column_type = table_headers.HYBRIDIZATION_ASSAY_NAME
[docs]
class MsAssayNameHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.MS_ASSAY_NAME`` columns of an assay."""

    column_type = table_headers.MS_ASSAY_NAME
[docs]
class NormalizationNameHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.NORMALIZATION_NAME`` columns of an assay."""

    column_type = table_headers.NORMALIZATION_NAME
[docs]
class ProtocolRefHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.PROTOCOL_REF`` columns of a study or assay."""

    column_type = table_headers.PROTOCOL_REF
[docs]
class ScanNameHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.SCAN_NAME`` columns of an assay."""

    column_type = table_headers.SCAN_NAME
# Other simple header
[docs]
class ArrayDesignRefHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.ARRAY_DESIGN_REF`` columns of an assay."""

    column_type = table_headers.ARRAY_DESIGN_REF
[docs]
class DateHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.DATE`` annotation columns of a study or assay."""

    column_type = table_headers.DATE
[docs]
class FirstDimensionHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.FIRST_DIMENSION`` columns of an assay."""

    column_type = table_headers.FIRST_DIMENSION
[docs]
class LabelHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.LABEL`` columns of an assay."""

    column_type = table_headers.LABEL
[docs]
class MaterialTypeHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.MATERIAL_TYPE`` columns of an assay."""

    column_type = table_headers.MATERIAL_TYPE
[docs]
class PerformerHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.PERFORMER`` columns of a study or assay."""

    column_type = table_headers.PERFORMER
[docs]
class SecondDimensionHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.SECOND_DIMENSION`` columns of an assay."""

    column_type = table_headers.SECOND_DIMENSION
[docs]
class TermRefAnnotationHeader(ColumnHeader):
    """Header describing a term reference annotation.

    Unlike the simple headers it spans two columns: its type is
    ``table_headers.TERM_SOURCE_REF`` and its simple string form also lists
    ``table_headers.TERM_ACCESSION_NUMBER``.
    """

    def __init__(self, col_no):
        super().__init__(column_type=table_headers.TERM_SOURCE_REF, col_no=col_no, span=2)

    def get_simple_string(self) -> List[str]:
        """Return the names of both columns covered by this header, in order."""
        covered_columns = (
            table_headers.TERM_SOURCE_REF,
            table_headers.TERM_ACCESSION_NUMBER,
        )
        return list(covered_columns)
[docs]
class UnitHeader(SimpleColumnHeader):
    """Single-column header for ``table_headers.UNIT`` annotation columns of a study or assay."""

    column_type = table_headers.UNIT
# Labeled header
[docs]
class LabeledColumnHeader(ColumnHeader):
    """Base class for one-column headers that carry a label, e.g. ``Comment[Label]``.

    Subclasses override ``column_type``; the label is stored on the instance
    as ``label``.
    """

    #: The value to use for the ``type`` argument.
    column_type = None

    def __init__(self, col_no, label):
        super().__init__(column_type=self.column_type, col_no=col_no, span=1)
        self.label = label

    def __str__(self):
        # ``!r`` applies ``repr()`` to each field, e.g. strings appear quoted.
        template = "LabeledColumnHeader(column_type={!r}, col_no={!r}, span={!r}, label={!r})"
        return template.format(self.column_type, self.col_no, self.span, self.label)

    def __repr__(self):
        # The debug representation is the same text as the string form.
        return self.__str__()

    def get_simple_string(self):
        """Return ``["<column_type>[<label>]"]`` as a one-element list."""
        # Plain concatenation keeps the requirement that both parts are strings.
        bracketed_name = self.column_type + "[" + self.label + "]"
        return [bracketed_name]
[docs]
class CharacteristicsHeader(LabeledColumnHeader):
    """Labeled header for material ``Characteristics[*]`` columns of a study or assay."""

    column_type = table_headers.CHARACTERISTICS
[docs]
class CommentHeader(LabeledColumnHeader):
    """Labeled header for ``Comment[*]`` columns of a study or assay."""

    column_type = table_headers.COMMENT
[docs]
class FactorValueHeader(LabeledColumnHeader):
    """Labeled header for ``Factor Value[*]`` columns."""

    column_type = table_headers.FACTOR_VALUE
[docs]
class ParameterValueHeader(LabeledColumnHeader):
    """Labeled header for protocol ``Parameter Value[*]`` columns of a study or assay."""

    column_type = table_headers.PARAMETER_VALUE
# Header parsers
[docs]
class HeaderParserBase:
    """
    Common machinery for turning the header row of a study or assay file
    into ``ColumnHeader`` objects.

    The parser reads ``allowed_headers`` and ``file_type`` from ``self``;
    ``file_type`` is not defined here and ``allowed_headers`` defaults to
    ``None``, so both are expected to come from a subclass.

    :type tokens: list
    :param tokens: List of strings, e.g. a split line read from a tsv/csv file.
    """

    #: Names of the allowed headers
    allowed_headers = None

    # Every class listed below declares the header name it handles in its
    # ``column_type`` attribute, so the lookup table is keyed by that attribute.
    #: Headers that are mapped to ``SimpleColumnHeader``
    simple_headers = {
        header_class.column_type: header_class
        for header_class in (
            # Material headers
            ExtractHeader,
            LabeledExtractHeader,
            LibraryHeader,
            SampleHeader,
            SourceHeader,
            # Data headers
            ArrayDataFileHeader,
            ArrayDataMatrixFileHeader,
            ArrayDesignFileHeader,
            DerivedArrayDataFileHeader,
            DerivedArrayDataMatrixFileHeader,
            DerivedDataFileHeader,
            DerivedSpectralDataFileHeader,
            ImageFileHeader,
            MetaboliteAssignmentFileHeader,
            PeptideAssignmentFileHeader,
            PostTranslationalModificationAssignmentFileHeader,
            ProteinAssignmentFileHeader,
            RawDataFileHeader,
            RawSpectralDataFileHeader,
            SpotPickingFileHeader,
            # Process names or Ref
            AssayNameHeader,
            DataTransformationNameHeader,
            GelElectrophoresisAssayNameHeader,
            HybridizationAssayNameHeader,
            MsAssayNameHeader,
            NormalizationNameHeader,
            ProtocolRefHeader,
            ScanNameHeader,
            # Simple headers
            ArrayDesignRefHeader,
            DateHeader,
            FirstDimensionHeader,
            LabelHeader,
            MaterialTypeHeader,
            PerformerHeader,
            SecondDimensionHeader,
            # Secondary annotations
            UnitHeader,
        )
    }

    #: Labeled headers
    labeled_headers = {
        header_class.column_type: header_class
        for header_class in (
            CharacteristicsHeader,
            CommentHeader,
            FactorValueHeader,
            ParameterValueHeader,
        )
    }

    def __init__(self, tokens):
        self.tokens = tokens
        self.it = iter(tokens)
        # Index of the next column to be assigned to a header.
        self.col_no = 0

    def run(self) -> Iterator[ColumnHeader]:
        """Yield one ``ColumnHeader`` per parsed header until the tokens run out."""
        while True:
            try:
                header = self._parse_next()
            except StopIteration:
                # Raised by ``_parse_next`` when no token is left.
                return
            yield header

    def _parse_next(self):
        """Consume the next token and build the matching header object.

        Raises ``StopIteration`` when the tokens are exhausted and
        ``ParseIsatabException`` for unknown or disallowed headers.
        """
        token = next(self.it)  # StopIteration is OK here

        # Dispatch order: exact match on "Term Source REF", then the other
        # exact matches, then the prefix matches (e.g., "Comment[Label]").
        if token == table_headers.TERM_SOURCE_REF:
            return self._parse_term_source_ref()

        if token in self.simple_headers:
            if token not in self.allowed_headers:
                not_allowed = 'Header "{}" not allowed in {}.'.format(token, self.file_type)
                raise ParseIsatabException(not_allowed)
            return self._parse_simple_column_header(self.simple_headers[token])

        for prefix, header_class in self.labeled_headers.items():
            if not token.startswith(prefix):
                continue
            if prefix not in self.allowed_headers:
                not_allowed = 'Header "{}" not allowed in {}.'.format(prefix, self.file_type)
                raise ParseIsatabException(not_allowed)
            return self._parse_labeled_column_header(token, prefix, header_class)

        # No branch above recognised the token.
        raise ParseIsatabException('Header "{}" unknown, processing unclear'.format(token))

    def _parse_term_source_ref(self):
        """Build the two-column term reference header after "Term Source REF" was read."""
        # Running out of tokens is NOT okay here: the accession number column
        # has to follow directly.
        try:
            follower = next(self.it)
        except StopIteration as e:
            raise ParseIsatabException(
                'Expected one more column on seeing "Term Source REF"'
            ) from e
        if follower != table_headers.TERM_ACCESSION_NUMBER:
            raise ParseIsatabException(
                'Expected column "Term Accession Number" after seeing "Term Source REF"'
            )
        first_column = self.col_no
        self.col_no = first_column + 2
        return TermRefAnnotationHeader(first_column)

    def _parse_simple_column_header(self, type_):
        """Instantiate ``type_`` for the current column and advance by one column."""
        column = self.col_no
        self.col_no = column + 1
        return type_(column)

    def _parse_labeled_column_header(self, val, key, type_):
        """Extract the bracketed label from ``val`` and instantiate ``type_`` with it."""
        bracketed = val[len(key) :]  # what remains after the leading ``key``
        # An empty remainder fails both checks, so it is rejected as well.
        if not (bracketed.startswith("[") and bracketed.endswith("]")):
            raise ParseIsatabException("Problem parsing labeled header {}".format(val))
        column = self.col_no
        self.col_no = column + 1
        return type_(column, bracketed[1:-1])
[docs]
class StudyHeaderParser(HeaderParserBase):
    """Header parser configured with the headers that may occur in a study file."""

    file_type = "study"  # only used when building exception messages

    allowed_headers = (
        # Names of materials
        table_headers.SAMPLE_NAME,
        table_headers.SOURCE_NAME,
        # Names of processes
        table_headers.PROTOCOL_REF,
        # Unlabeled single-column headers
        table_headers.DATE,
        table_headers.PERFORMER,
        # Headers carrying a ``[label]`` suffix
        table_headers.CHARACTERISTICS,
        table_headers.COMMENT,
        table_headers.FACTOR_VALUE,
        table_headers.PARAMETER_VALUE,
        # Annotations attached to other columns
        table_headers.TERM_SOURCE_REF,
        table_headers.UNIT,
    )
[docs]
class AssayHeaderParser(HeaderParserBase):
    """Header parser configured with the headers that may occur in an assay file."""

    file_type = "assay"  # only used when building exception messages

    allowed_headers = (
        # Names of materials
        table_headers.EXTRACT_NAME,
        table_headers.LABELED_EXTRACT_NAME,
        table_headers.LIBRARY_NAME,
        table_headers.SAMPLE_NAME,
        # Names of data files
        table_headers.ARRAY_DATA_FILE,
        table_headers.ARRAY_DATA_MATRIX_FILE,
        table_headers.ARRAY_DESIGN_FILE,
        table_headers.DERIVED_ARRAY_DATA_FILE,
        table_headers.DERIVED_ARRAY_DATA_MATRIX_FILE,
        table_headers.DERIVED_DATA_FILE,
        table_headers.DERIVED_SPECTRAL_DATA_FILE,
        table_headers.IMAGE_FILE,
        table_headers.METABOLITE_ASSIGNMENT_FILE,
        table_headers.PEPTIDE_ASSIGNMENT_FILE,
        table_headers.POST_TRANSLATIONAL_MODIFICATION_ASSIGNMENT_FILE,
        table_headers.PROTEIN_ASSIGNMENT_FILE,
        table_headers.RAW_DATA_FILE,
        table_headers.RAW_SPECTRAL_DATA_FILE,
        table_headers.SPOT_PICKING_FILE,
        # Names of processes, or protocol references
        table_headers.ASSAY_NAME,
        table_headers.DATA_TRANSFORMATION_NAME,
        table_headers.GEL_ELECTROPHORESIS_ASSAY_NAME,
        table_headers.HYBRIDIZATION_ASSAY_NAME,
        table_headers.MS_ASSAY_NAME,
        table_headers.NORMALIZATION_NAME,
        table_headers.PROTOCOL_REF,
        table_headers.SCAN_NAME,
        # Unlabeled single-column headers
        table_headers.ARRAY_DESIGN_REF,
        table_headers.DATE,
        table_headers.FIRST_DIMENSION,
        table_headers.LABEL,
        table_headers.MATERIAL_TYPE,
        table_headers.PERFORMER,
        table_headers.SECOND_DIMENSION,
        # Headers carrying a ``[label]`` suffix
        table_headers.CHARACTERISTICS,
        table_headers.COMMENT,
        table_headers.PARAMETER_VALUE,
        # Annotations attached to other columns
        table_headers.TERM_SOURCE_REF,
        table_headers.UNIT,
    )