MZTabHeaderLineParser.java
/*
* Copyright 2018 Leibniz-Institut für Analytische Wissenschaften – ISAS – e.V..
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.ac.ebi.pride.jmztab2.utils.parser;
import de.isas.mztab2.model.Assay;
import de.isas.mztab2.model.Metadata;
import de.isas.mztab2.model.MsRun;
import de.isas.mztab2.model.Parameter;
import de.isas.mztab2.model.SmallMoleculeSummary;
import de.isas.mztab2.model.StudyVariable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import uk.ac.ebi.pride.jmztab2.model.MZBoolean;
import uk.ac.ebi.pride.jmztab2.model.MZTabColumn;
import uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory;
import uk.ac.ebi.pride.jmztab2.model.MZTabConstants;
import uk.ac.ebi.pride.jmztab2.model.Section;
import uk.ac.ebi.pride.jmztab2.utils.errors.FormatErrorType;
import uk.ac.ebi.pride.jmztab2.utils.errors.LogicalErrorType;
import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabError;
import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabErrorList;
import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException;
/**
* A couple of common method used to parse a header line into {@link uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory} structure.
*
* NOTICE: {@link uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory} maintain a couple of {@link MZTabColumn} which have internal logical
* position and order. In physical mzTab file, we allow user not obey this logical position organized way,
* and provide their date with own order. In order to distinguish them, we use physical position (a positive
* integer) to record the column location in mzTab file. And use {@link uk.ac.ebi.pride.jmztab2.utils.parser.PositionMapping} structure the maintain
* the mapping between them.
*
* @author qingwei
* @see SMHLineParser
* @see SMFLineParser
* @see SMELineParser
* @since 11/02/13
*
*/
public abstract class MZTabHeaderLineParser extends MZTabLineParser {
protected MZTabColumnFactory factory;
protected Metadata metadata;
/**
* Parse a header line into {@link uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory} structure.
*
* @param context the parser context, keeping dynamic state and lookup associations.
* @param factory SHOULD NOT set null
* @param metadata SHOULD NOT set null
*/
protected MZTabHeaderLineParser(MZTabParserContext context, MZTabColumnFactory factory, Metadata metadata) {
super(context);
if (factory == null) {
throw new NullPointerException("Header line should be parsed first!");
}
this.factory = factory;
if (metadata == null) {
throw new NullPointerException("Metadata should be created first!");
}
this.metadata = metadata;
}
/**
* {@inheritDoc}
*
* Parse a header line into {@link MZTabColumnFactory} structure. There are several steps in this method:
* Step 1: {@link #parseColumns()} focus on validate and parse all columns.
* Step 2: {@link #refine()}
*/
@Override
public void parse(int lineNumber, String line, MZTabErrorList errorList) throws MZTabException {
super.parse(lineNumber, line, errorList);
int offset = parseColumns();
if (offset != items.length) {
this.errorList.add(new MZTabError(LogicalErrorType.HeaderLine, lineNumber, section.getName(), "" + offset, "" + items.length));
}
refine();
}
/**
* This methods delegates to the subclasses the parsing of the columns. All of the columns are defined in
* {@link uk.ac.ebi.pride.jmztab2.model.SmallMoleculeColumn}, {@link uk.ac.ebi.pride.jmztab2.model.SmallMoleculeFeatureColumn}, or {@link uk.ac.ebi.pride.jmztab2.model.SmallMoleculeEvidenceColumn}.
*
* @return the next physical index of column available after the parsing.
* @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
*/
protected abstract int parseColumns() throws MZTabException;
/**
* Some validate operation need to be done after the whole {@link uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory} created.
* Thus, user can add them, and called at the end of the
* {@link #parse(int, String, MZTabErrorList)} method.
*
* @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
*/
protected abstract void refine() throws MZTabException;
/**
* Refine optional columns and check, whether they were properly defined.
* These re-validate operation will called in {@link #refine()} method.
*
* @param section a {@link Section} object defining the part of the document.
* @param columnHeader a {@link java.lang.String} object.
* @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
*/
protected void refineOptionalColumn(Section section, String columnHeader) throws MZTabException {
if (factory.findColumnByHeader(columnHeader) == null) {
throw new MZTabException(new MZTabError(LogicalErrorType.NotDefineInHeader, lineNumber, columnHeader, section.getName()));
}
}
/**
* <p>fromIndexToOrder.</p>
*
* @param index a {@link java.lang.Integer} object.
* @return a {@link java.lang.String} object.
*/
protected String fromIndexToOrder(Integer index) {
return String.format("%02d", index);
}
/**
* Additional columns can be added to the end of the protein table. These column headers MUST start with the prefix "opt_".
* Column names MUST only contain the following characters: 'A'-'Z', 'a'-'z', '0'-'9', '_', '-', '[', ']', and ':'.
*
* the format: opt_{IndexedElement[id]}_{value}. Spaces within the parameter's name MUST be replaced by '_'.
*
* @param nameLabel a {@link java.lang.String} object.
* @return a boolean.
* @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
*/
protected boolean checkOptColumnName(String nameLabel) throws MZTabException {
nameLabel = nameLabel.trim();
String regexp = MZTabConstants.REGEX_OPT_COLUMN_NAME;
Pattern pattern = Pattern.compile(regexp);
Matcher matcher = pattern.matcher(nameLabel);
Integer id;
String object_id;
String value;
MZTabError error;
if (matcher.find()) {
object_id = matcher.group(1);
value = matcher.group(4);
Parameter param = null;
if (value.startsWith(MZTabConstants.CV_PREFIX)) {
param = checkCVParamOptColumnName(nameLabel, value);
}
Class dataType = getDataType(param);
if (object_id.contains(MZTabConstants.GLOBAL)) {
if (param == null) {
factory.addOptionalColumn(value, dataType);
} else {
factory.addOptionalColumn(param, dataType);
}
} else {
id = parseIndex(nameLabel, matcher.group(3));
if (object_id.contains(Metadata.Properties.assay.getPropertyName())) {
Assay element = context.getAssayMap().get(id);
// not found assay_id in metadata.
if (element == null) {
error = new MZTabError(LogicalErrorType.AssayNotDefined, lineNumber, nameLabel);
throw new MZTabException(error);
} else if (param == null) {
factory.addOptionalColumn(element, value, dataType);
} else {
factory.addOptionalColumn(element, param, dataType);
}
} else if (object_id.contains(Metadata.Properties.studyVariable.getPropertyName())) {
StudyVariable element = context.getStudyVariableMap().get(id);
// not found study_variable_id in metadata.
if (element == null) {
error = new MZTabError(LogicalErrorType.StudyVariableNotDefined, lineNumber, nameLabel);
throw new MZTabException(error);
} else if (param == null) {
factory.addOptionalColumn(element, value, dataType);
} else {
factory.addOptionalColumn(element, param, dataType);
}
} else if (object_id.contains(Metadata.Properties.msRun.getPropertyName())) {
// not found ms_run_id in metadata.
MsRun element = context.getMsRunMap().get(id);
if (element == null) {
error = new MZTabError(LogicalErrorType.MsRunNotDefined, lineNumber, nameLabel);
throw new MZTabException(error);
} else if (param == null) {
factory.addOptionalColumn(element, value, dataType);
} else {
factory.addOptionalColumn(element, param, dataType);
}
}
}
return true;
} else {
throw new MZTabException(new MZTabError(FormatErrorType.OptionalCVParamColumn, lineNumber, nameLabel));
}
}
/**
* An kind of {@link CVParamOptionColumn} which use CV parameter accessions in following the format:
* opt_{OBJECT_ID}_cv_{accession}_{parameter name}. Spaces within the parameter' s name MUST be replaced by '_'.
*/
private Parameter checkCVParamOptColumnName(String nameLabel, String valueLabel) throws MZTabException {
nameLabel = nameLabel.trim();
valueLabel = valueLabel.trim();
String regexp = MZTabConstants.REGEX_CV_PARAM_OPT_COLUMN_NAME;
Pattern pattern = Pattern.compile(regexp);
Matcher matcher = pattern.matcher(valueLabel);
Parameter param;
if (!matcher.find() || matcher.end() != valueLabel.length()) {
throw new MZTabException(new MZTabError(FormatErrorType.OptionalCVParamColumn, lineNumber, nameLabel));
} else {
String accession = matcher.group(2);
String name = matcher.group(4);
if (name == null || name.trim().length() == 0) {
throw new MZTabException(new MZTabError(FormatErrorType.OptionalCVParamColumn, lineNumber, nameLabel));
}
param = matcher.group(4) == null ? null : new Parameter().cvAccession(accession).name(name);
}
return param;
}
/**
* Some {@link CVParamOptionColumn}, their data type have defined. Currently, we provide two {@link Parameter}
* which defined in the mzTab specification. One is "emPAI value" (MS:1001905), data type is Double;
* another is "decoy peptide" (MS:1002217), the data type is Boolean (0/1). Besides them, "opt_" start optional
* column data type is String.
*
* @see #checkOptColumnName(String)
*/
private Class getDataType(Parameter param) {
Class dataType;
if (param == null) {
dataType = String.class;
} else if (param.getCvAccession().equals("MS:1001905")) {
dataType = Double.class;
} else if (param.getCvAccession().equals("MS:1002217")) {
dataType = MZBoolean.class;
} else if (param.getCvAccession().equals("PRIDE:0000303")) {
dataType = MZBoolean.class;
} else {
dataType = String.class;
}
return dataType;
}
/**
* <p>checkAbundanceColumns.</p>
*
* @param offset a int.
* @param order a {@link java.lang.String} object.
* @return a int.
* @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
*/
protected int checkAbundanceColumns(int offset, String order) throws MZTabException {
String headerString = items[offset];
if (headerString.contains(SmallMoleculeSummary.Properties.abundanceAssay.getPropertyName())) {
checkAbundanceAssayColumn(headerString, order);
return offset;
} else if (headerString.contains(SmallMoleculeSummary.Properties.abundanceStudyVariable.getPropertyName()) || headerString.contains(SmallMoleculeSummary.Properties.abundanceVariationStudyVariable.getPropertyName())) {
checkAbundanceStudyVariableColumns(headerString, order);
return offset;
} else {
MZTabError error = new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, headerString);
throw new MZTabException(error);
}
}
/**
* Check (protein|peptide|smallmolecule)_abundance is correct, and return object value label.
* For example, protein_abundance_std_error_study_variable[id], return study_variable[id].
*/
private String checkAbundanceSection(String abundanceHeader) throws MZTabException {
abundanceHeader = abundanceHeader.trim().toLowerCase();
Pattern pattern = Pattern.compile(MZTabConstants.REGEX_ABUNDANCE_COLUMN_NAME);
Matcher matcher = pattern.matcher(abundanceHeader);
if (matcher.find()) {
// String sectionName = matcher.group(1);
// if (sectionName != null &&
// !(sectionName.equals(Section.Protein.getName()) && section != Section.Protein_Header) &&
// !(sectionName.equals(Section.Peptide.getName()) && section != Section.Peptide_Header) &&
// !(sectionName.equals(Section.Small_Molecule.getName()) && section != Section.Small_Molecule_Header)) {
return matcher.group(1);
// }
// MZTabError error = new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, abundanceHeader);
// throw new MZTabException(error);
} else {
MZTabError error = new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, abundanceHeader);
throw new MZTabException(error);
}
}
private void checkAbundanceAssayColumn(String abundanceHeader, String order) throws MZTabException {
String valueLabel = checkAbundanceSection(abundanceHeader);
Pattern pattern = Pattern.compile(MZTabConstants.REGEX_ABUNDANCE_ASSAY_COLUMN_NAME);
Matcher matcher = pattern.matcher(valueLabel);
if (!matcher.find()) {
MZTabError error = new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, abundanceHeader);
throw new MZTabException(error);
}
int id = parseIndex(abundanceHeader, matcher.group(1));
Assay assay = context.getAssayMap().get(id);
if (assay == null) {
MZTabError error = new MZTabError(LogicalErrorType.AssayNotDefined, lineNumber, abundanceHeader);
throw new MZTabException(error);
}
factory.addAbundanceOptionalColumn(assay, order);
}
private void checkAbundanceStudyVariableColumns(String header,
String order) throws MZTabException {
header = header.trim().toLowerCase();
if (!header.contains(SmallMoleculeSummary.Properties.abundanceStudyVariable.getPropertyName()
) && !header.contains(SmallMoleculeSummary.Properties.abundanceVariationStudyVariable.getPropertyName())) {
MZTabError error = new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, header);
throw new MZTabException(error);
} else {
StudyVariable abundanceStudyVariable = checkAbundanceStudyVariableColumn(header);
//adds both abundance_study_variable and abundance_coeffvar_study_variable columns
factory.addAbundanceOptionalColumn(abundanceStudyVariable, checkAbundanceSection(header), order);
}
}
/**
* Check XXXX_abundance_study_variable[id], XXXX_abundance_stdev_study_variable[id], XXXX_abundance_std_error_study_variable[id]
* column header. If parse error, stop validate and raise {@link MZTabException}.
*/
private StudyVariable checkAbundanceStudyVariableColumn(String abundanceHeader) throws MZTabException {
String valueLabel = checkAbundanceSection(abundanceHeader);
Pattern pattern = Pattern.compile(MZTabConstants.REGEX_STUDY_VARIABLE_COLUMN_NAME);
Matcher matcher = pattern.matcher(valueLabel);
if (!matcher.find()) {
MZTabError error = new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, abundanceHeader);
throw new MZTabException(error);
}
int id = parseIndex(abundanceHeader, matcher.group(1));
StudyVariable studyVariable = context.getStudyVariableMap().get(id);
if (studyVariable == null) {
MZTabError error = new MZTabError(LogicalErrorType.StudyVariableNotDefined, lineNumber, abundanceHeader);
throw new MZTabException(error);
}
return studyVariable;
}
/**
* Parse header to a index id number.
* If exists parse error, stop validate and throw {@link uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException} directly.
*
* @param header a {@link java.lang.String} object.
* @param id a {@link java.lang.String} object.
* @return a int.
* @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
*/
protected int parseIndex(String header, String id) throws MZTabException {
try {
Integer index = Integer.parseInt(id);
if (index < 1) {
throw new NumberFormatException();
}
return index;
} catch (NumberFormatException e) {
MZTabError error = new MZTabError(LogicalErrorType.IdNumber, lineNumber, header, id);
throw new MZTabException(error);
}
}
/**
* <p>Getter for the field <code>factory</code>.</p>
*
* @return a {@link uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory} object.
*/
public MZTabColumnFactory getFactory() {
return factory;
}
}