// MzTabFileParser.java

/* 
 * Copyright 2018 Leibniz-Institut für Analytische Wissenschaften – ISAS – e.V..
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.isas.mztab2.io;

import de.isas.mztab2.model.ColumnParameterMapping;
import de.isas.mztab2.model.Comment;
import de.isas.mztab2.model.Metadata;
import de.isas.mztab2.model.MsRun;
import de.isas.mztab2.model.MzTab;
import de.isas.mztab2.model.SmallMoleculeEvidence;
import de.isas.mztab2.model.SmallMoleculeFeature;
import de.isas.mztab2.model.SmallMoleculeSummary;
import java.io.*;
import java.net.URI;
import java.net.URL;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
import uk.ac.ebi.pride.jmztab2.model.IMZTabColumn;
import uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory;
import static uk.ac.ebi.pride.jmztab2.model.MZTabConstants.NEW_LINE;
import static uk.ac.ebi.pride.jmztab2.model.MZTabConstants.REGEX_DEFAULT_RELIABILITY;
import static uk.ac.ebi.pride.jmztab2.model.MZTabConstants.TAB;
import uk.ac.ebi.pride.jmztab2.model.MZTabStringUtils;
import uk.ac.ebi.pride.jmztab2.model.Section;
import static uk.ac.ebi.pride.jmztab2.utils.MZTabProperties.*;
import uk.ac.ebi.pride.jmztab2.utils.errors.FormatErrorType;
import uk.ac.ebi.pride.jmztab2.utils.errors.LogicalErrorType;
import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabError;
import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabErrorList;
import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabErrorOverflowException;
import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabErrorType;
import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException;
import uk.ac.ebi.pride.jmztab2.utils.parser.COMLineParser;
import uk.ac.ebi.pride.jmztab2.utils.parser.MTDLineParser;
import uk.ac.ebi.pride.jmztab2.utils.parser.MZTabParserContext;
import uk.ac.ebi.pride.jmztab2.utils.parser.PositionMapping;
import uk.ac.ebi.pride.jmztab2.utils.parser.SEHLineParser;
import uk.ac.ebi.pride.jmztab2.utils.parser.SFHLineParser;
import uk.ac.ebi.pride.jmztab2.utils.parser.SMELineParser;
import uk.ac.ebi.pride.jmztab2.utils.parser.SMFLineParser;
import uk.ac.ebi.pride.jmztab2.utils.parser.SMHLineParser;
import uk.ac.ebi.pride.jmztab2.utils.parser.SMLLineParser;

/**
 *
 * MzTabFileParser provides reading functionality for mzTab files. During the
 * parsing process, minimal integrity checks are performed.
 *
 * @author qingwei
 * @author nilshoffmann
 * 
 * @since 21/02/13
 *
 */
public class MzTabFileParser {

    // Parsed mzTab model. Only assigned by check() when the error list is
    // still empty after all lines have been parsed.
    private MzTab mzTabFile;
    // Location of the mzTab file to parse; set once by the URI constructor.
    private URI tabFile;

    // Collects the errors reported while parsing; created anew on every call
    // of parse(OutputStream, MZTabErrorType.Level, int).
    private MZTabErrorList errorList;
    // Context object handed to the line parsers; created anew on every call
    // of parse(OutputStream, MZTabErrorType.Level, int).
    private MZTabParserContext context;

    /**
     * Create a new {@code MzTabFileParser} for the given file.
     *
     * @param tabFile the mzTab file. Must not be null and must exist.
     * @throws java.lang.IllegalArgumentException if {@code tabFile} is null
     * or does not exist.
     */
    public MzTabFileParser(File tabFile) throws IllegalArgumentException {
        // A null file is passed on as a null URI so that the URI constructor
        // rejects it with the documented IllegalArgumentException, instead of
        // a NullPointerException being raised by tabFile.toURI().
        this(tabFile == null ? null : tabFile.toURI());
    }

    /**
     * Create a new {@code MzTabFileParser} for the given file URI.
     *
     * @param tabFileUri the mzTab file URI. Must not be null. If it uses the
     * {@code file} scheme, the referenced file must exist.
     * @throws java.lang.IllegalArgumentException if {@code tabFileUri} is
     * null, or is a {@code file} URI whose target does not exist.
     */
    public MzTabFileParser(URI tabFileUri) throws IllegalArgumentException {
        if (tabFileUri == null) {
            throw new IllegalArgumentException(
                "MZTab file uri must not be null!");
        }
        // URI schemes are case-insensitive, so "FILE:/..." has to be subject
        // to the existence check as well. A missing scheme yields false.
        boolean isLocalFile = "file".equalsIgnoreCase(tabFileUri.getScheme());
        if (isLocalFile && !new File(tabFileUri).exists()) {
            throw new IllegalArgumentException("MZTab File URI " + tabFileUri.
                toASCIIString() + " does not exist!");
        }

        this.tabFile = tabFileUri;
    }

    /**
     * Parse the mzTab file of this parser using a fresh
     * {@code MZTabParserContext} and {@code MZTabErrorList}. Parsing messages
     * and the collected errors are written to the provided
     * {@link java.io.OutputStream}. The stream is flushed but never closed by
     * this method; closing it remains the responsibility of the caller.
     *
     * @param out the output stream for parsing messages
     * @param level the minimum error level to report errors for
     * @param maxErrorCount the maximum number of errors to report in the
     * {@link uk.ac.ebi.pride.jmztab2.utils.errors.MZTabErrorList} returned by
     * {@link de.isas.mztab2.io.MzTabFileParser#getErrorList()}
     * @return the error list
     * @throws java.io.IOException if any io related errors occur.
     */
    public MZTabErrorList parse(OutputStream out, MZTabErrorType.Level level,
        int maxErrorCount) throws IOException {
        try {
            context = new MZTabParserContext();
            errorList = new MZTabErrorList(level, maxErrorCount);
            check();
            refine();
        } catch (MZTabException e) {
            String message = e.getMessage();
            if (message != null) {
                out.write(message.getBytes());
            }
            // The PrintStream is intentionally NOT closed: closing it would
            // close the caller's stream and break the writes further below.
            PrintStream ps = new PrintStream(out);
            e.printStackTrace(ps);
            ps.flush();
            errorList.add(e.getError());
        } catch (MZTabErrorOverflowException e) {
            // See above: flush only, do not close the caller's stream.
            PrintStream ps = new PrintStream(out);
            e.printStackTrace(ps);
            ps.flush();
            String message = e.getMessage();
            if (message != null) {
                out.write(message.getBytes());
            }
        }

        errorList.print(out);
        if (mzTabFile != null && errorList.isEmpty()) {
            out.write(
                ("No structural or logical errors in " + tabFile + " file!" + NEW_LINE).
                    getBytes());
        }
        return errorList;
    }

    /**
     * Parse the mzTab file, reporting errors of at least the given level.
     * Delegates to {@link #parse(OutputStream, MZTabErrorType.Level, int)}
     * with
     * {@link uk.ac.ebi.pride.jmztab2.utils.MZTabProperties#MAX_ERROR_COUNT}
     * as the maximum number of errors to report.
     *
     * @param out the output stream for parsing messages
     * @param level the minimum error level to report errors for
     * @return the error list
     * @throws java.io.IOException if any io related errors occur.
     */
    public MZTabErrorList parse(OutputStream out, MZTabErrorType.Level level) throws IOException {
        final int errorLimit = MAX_ERROR_COUNT;
        return parse(out, level, errorLimit);
    }

    /**
     * Parse the mzTab file with the default settings. Delegates to
     * {@link #parse(OutputStream, MZTabErrorType.Level, int)} with
     * {@link uk.ac.ebi.pride.jmztab2.utils.MZTabProperties#LEVEL} as the
     * minimum error level and
     * {@link uk.ac.ebi.pride.jmztab2.utils.MZTabProperties#MAX_ERROR_COUNT}
     * as the maximum number of errors to report.
     *
     * @param out the output stream for parsing messages
     * @return the error list
     * @throws java.io.IOException if any io related errors occur.
     */
    public MZTabErrorList parse(OutputStream out) throws IOException {
        final MZTabErrorType.Level defaultLevel = LEVEL;
        final int errorLimit = MAX_ERROR_COUNT;
        return parse(out, defaultLevel, errorLimit);
    }

    /**
     * Returns the error list of the most recent parse run. The list is
     * created by {@link #parse(OutputStream, MZTabErrorType.Level, int)}, so
     * this method returns {@code null} as long as no parse method has been
     * called.
     *
     * @return the {@link uk.ac.ebi.pride.jmztab2.utils.errors.MZTabErrorList}
     * of the last parse run, or {@code null}.
     */
    public MZTabErrorList getErrorList() {
        return this.errorList;
    }

    /**
     * Determine the section of a line from its prefix, i.e. the trimmed text
     * in front of the first tab character (or the whole trimmed line if it
     * contains no tab).
     * <p>
     * Lines consisting only of tabs and whitespace yield an empty prefix,
     * which is handed to {@link Section#findSection(String)} like any other
     * unknown prefix, instead of failing with an
     * {@code ArrayIndexOutOfBoundsException}.
     *
     * @param line the line to inspect, must not be null.
     * @return the result of {@code Section.findSection} for the line prefix;
     * callers treat {@code null} as an unknown prefix.
     */
    private Section getSection(String line) {
        // Plain index lookup instead of compiling a split regex per line.
        int tabIndex = line.indexOf(TAB);
        String prefix = tabIndex < 0 ? line : line.substring(0, tabIndex);
        return Section.findSection(prefix.trim());
    }

    /**
     * Open a buffered reader on the given mzTab location, decoding it with
     * the {@code ENCODE} charset.
     * <p>
     * {@code file} URIs that denote a regular file are read directly from
     * disk, everything else is opened through {@link URI#toURL()}. If the
     * URI path (or, for opaque URIs, the scheme specific part) ends with
     * {@code .gz}, the content is decompressed on the fly.
     *
     * @param tabFile the location of the mzTab file.
     * @return a reader on the (decompressed) file content. The caller is
     * responsible for closing it.
     * @throws IOException if the location can not be opened or is not valid
     * gzip data although named {@code .gz}.
     */
    private BufferedReader readFile(URI tabFile) throws IOException {
        // new File(URI) throws IllegalArgumentException for any scheme other
        // than "file", so it must only be used for file URIs. Otherwise
        // remote locations would never reach the URL branch.
        File localFile = "file".equalsIgnoreCase(tabFile.getScheme())
            ? new File(tabFile) : null;
        InputStream is = (localFile != null && localFile.isFile())
            ? new FileInputStream(localFile)
            : tabFile.toURL().openStream();
        try {
            // getPath() is null for opaque URIs (e.g. jar:file:...).
            String path = tabFile.getPath() != null
                ? tabFile.getPath() : tabFile.getSchemeSpecificPart();
            InputStream content = (path != null && path.endsWith(".gz"))
                ? new GZIPInputStream(is) : is;
            return new BufferedReader(new InputStreamReader(content, ENCODE));
        } catch (IOException | RuntimeException e) {
            // Do not leak the raw stream if wrapping it fails, e.g. because
            // the content is not in gzip format.
            try {
                is.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Abbreviate a line for use in error messages. Strings of up to 20
     * characters are returned unchanged; longer strings are cut to their
     * first 19 characters followed by {@code "..."}.
     *
     * @param source the text to abbreviate, must not be null.
     * @return the unchanged or abbreviated text.
     */
    private String subString(String source) {
        final int maxLength = 20;
        final boolean fits = source.length() <= maxLength;
        return fits ? source : source.substring(0, maxLength - 1) + "...";
    }

    /**
     * Run consistency checks on the parsed model. Does nothing if no model
     * has been built. Currently verifies that every ms_run which defines a
     * hash also defines a hash method.
     *
     * @throws MZTabException with a
     * {@code LogicalErrorType.MsRunHashMethodNotDefined} error for the first
     * ms_run that has a hash but no hash method.
     * @throws MZTabErrorOverflowException declared for callers; not raised
     * by the code of this method itself.
     */
    private void refine() throws MZTabException, MZTabErrorOverflowException {
        if (mzTabFile != null) {
            //If ms_run[1-n]-hash is present,  ms_run[1-n]-hash_method SHOULD also be present
            for (MsRun run : mzTabFile.getMetadata().getMsRun()) {
                boolean hashWithoutMethod = run.getHash() != null
                    && run.getHashMethod() == null;
                if (!hashWithoutMethod) {
                    continue;
                }
                String runId = run.getId().toString();
                MZTabError missingHashMethod = new MZTabError(
                    LogicalErrorType.MsRunHashMethodNotDefined, -1, runId);
                throw new MZTabException(missingHashMethod);
            }
        }
    }

    /**
     * Parse the mzTab file line by line and, if no errors were collected,
     * assemble the {@link MzTab} model and run the cross-section checks.
     * <p>
     * Empty lines are skipped and comment lines are collected. All other
     * lines are dispatched by their section prefix to the metadata parser or
     * to the small molecule summary / feature / evidence header and data
     * line parsers. Sections must appear in non-decreasing level order, and
     * data lines require that the header line of their table was seen before.
     * <p>
     * The model is only built if the error list is still empty after all
     * lines have been parsed. The following checks then add their findings to
     * the error list: presence of the summary, feature and evidence
     * sections, format of the reliability values when no custom reliability
     * is defined in the metadata, presence of the feature quantification
     * unit, colunit mappings, and SMF / SME id references.
     *
     * @throws java.io.IOException if the file can not be opened or read.
     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException on an
     * unknown line prefix, wrong section order, a repeated or missing header
     * line, a {@code NullPointerException} while handling a line, or when
     * raised by one of the line parsers.
     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabErrorOverflowException
     * when too many errors are detected.
     */
    private void check() throws IOException, MZTabException, MZTabErrorOverflowException {
        COMLineParser comParser = new COMLineParser(context);
        MTDLineParser mtdParser = new MTDLineParser(context);
        SMHLineParser smhParser = null;
        SMLLineParser smlParser = null;
        SFHLineParser sfhParser = null;
        SMFLineParser smfParser = null;
        SEHLineParser sehParser = null;
        SMELineParser smeParser = null;

        // all maps are keyed by line number, so iteration follows file order.
        SortedMap<Integer, Comment> commentMap = new TreeMap<>();
        SortedMap<Integer, SmallMoleculeSummary> smallMoleculeSummaryMap = new TreeMap<>();
        SortedMap<Integer, SmallMoleculeFeature> smallMoleculeFeatureMap = new TreeMap<>();
        SortedMap<Integer, SmallMoleculeEvidence> smallMoleculeEvidenceMap = new TreeMap<>();

        PositionMapping smlPositionMapping = null;
        PositionMapping smfPositionMapping = null;
        PositionMapping smePositionMapping = null;

        String line;
        // highest section level seen so far; lines of a lower level are
        // reported as a line order error.
        int highWaterMark = 1;
        int lineNumber = 0;
        Section section;
        try (BufferedReader reader = readFile(tabFile)) {
            while ((line = reader.readLine()) != null) {
                try {
                    lineNumber++;

                    if (MZTabStringUtils.isEmpty(line)) {
                        continue;
                    }

                    if (line.startsWith(Section.Comment.getPrefix())) {
                        comParser.parse(lineNumber, line, errorList);
                        commentMap.put(lineNumber, comParser.getComment());
                        continue;
                    }

                    section = getSection(line);
                    if (section == null) {
                        MZTabError sectionNullError = new MZTabError(
                            FormatErrorType.LinePrefix, lineNumber,
                            subString(line));
                        throw new MZTabException(sectionNullError);
                    }
                    if (section.getLevel() < highWaterMark) {
                        Section currentSection = Section.findSection(
                            highWaterMark);
                        MZTabError sectionLineOrderError = new MZTabError(
                            LogicalErrorType.LineOrder, lineNumber,
                            currentSection.getName(), section.getName());
                        throw new MZTabException(sectionLineOrderError);
                    }

                    highWaterMark = section.getLevel();

                    switch (highWaterMark) {
                        case 1:
                            // metadata section.
                            mtdParser.parse(lineNumber, line, errorList);
                            break;
                        case 8:
                            if (smhParser != null) {
                                MZTabError error = new MZTabError(
                                    LogicalErrorType.HeaderLine,
                                    lineNumber, subString(line));
                                // header line only display once!
                                throw new MZTabException(error);
                            }

                            // small molecule summary header line
                            smhParser = new SMHLineParser(context, mtdParser.
                                getMetadata());
                            smhParser.parse(lineNumber, line, errorList);
                            smlPositionMapping = new PositionMapping(smhParser.
                                getFactory(), line);

                            // tell system to continue check small molecule data line.
                            highWaterMark = 9;
                            break;
                        case 9:
                            if (smhParser == null) {
                                // header line should be check first.
                                throw new MZTabException(new MZTabError(
                                    LogicalErrorType.NoHeaderLine,
                                    lineNumber, subString(line)));
                            }

                            // the data line parser is created lazily on the
                            // first small molecule summary data line.
                            if (smlParser == null) {
                                smlParser = new SMLLineParser(context,
                                    smhParser.
                                        getFactory(),
                                    smlPositionMapping, mtdParser.getMetadata(),
                                    errorList);
                            }
                            smlParser.parse(lineNumber, line, errorList);
                            smallMoleculeSummaryMap.put(lineNumber, smlParser.
                                getRecord());

                            break;
                        case 10:
                            if (sfhParser != null) {
                                // header line only display once!
                                throw new MZTabException(new MZTabError(
                                    LogicalErrorType.HeaderLine,
                                    lineNumber, subString(line)));
                            }

                            // small molecule feature header line
                            sfhParser = new SFHLineParser(context, mtdParser.
                                getMetadata());
                            sfhParser.parse(lineNumber, line, errorList);
                            smfPositionMapping = new PositionMapping(sfhParser.
                                getFactory(), line);

                            // tell system to continue check small molecule feature data line.
                            highWaterMark = 11;
                            break;
                        case 11:
                            if (sfhParser == null) {
                                // header line should be check first.
                                throw new MZTabException(new MZTabError(
                                    LogicalErrorType.NoHeaderLine,
                                    lineNumber, subString(line)));
                            }

                            // created lazily on the first feature data line.
                            if (smfParser == null) {
                                smfParser = new SMFLineParser(context,
                                    sfhParser.
                                        getFactory(),
                                    smfPositionMapping, mtdParser.getMetadata(),
                                    errorList);
                            }
                            smfParser.parse(lineNumber, line, errorList);
                            smallMoleculeFeatureMap.put(lineNumber, smfParser.
                                getRecord());

                            break;
                        case 12:
                            if (sehParser != null) {
                                // header line only display once!
                                throw new MZTabException(new MZTabError(
                                    LogicalErrorType.HeaderLine,
                                    lineNumber, subString(line)));
                            }

                            // small molecule evidence header line
                            sehParser = new SEHLineParser(context, mtdParser.
                                getMetadata());
                            sehParser.parse(lineNumber, line, errorList);
                            smePositionMapping = new PositionMapping(sehParser.
                                getFactory(), line);

                            // tell system to continue check small molecule evidence data line.
                            highWaterMark = 13;
                            break;
                        case 13:
                            if (sehParser == null) {
                                // header line should be check first.
                                throw new MZTabException(new MZTabError(
                                    LogicalErrorType.NoHeaderLine,
                                    lineNumber, subString(line)));
                            }

                            // created lazily on the first evidence data line.
                            if (smeParser == null) {
                                smeParser = new SMELineParser(context,
                                    sehParser.
                                        getFactory(),
                                    smePositionMapping, mtdParser.getMetadata(),
                                    errorList);
                            }
                            smeParser.parse(lineNumber, line, errorList);
                            smallMoleculeEvidenceMap.put(lineNumber, smeParser.
                                getRecord());

                            break;
                        default:
                            throw new IllegalArgumentException(
                                "Unknown section level " + highWaterMark);
                    }
                } catch (NullPointerException npe) {
                    // report unexpected null values with the offending line.
                    throw new MZTabException(new MZTabError(
                        LogicalErrorType.NULL,
                        lineNumber, subString(line)), npe);
                }
            }

        }

        mtdParser.refineNormalMetadata();

        if (!errorList.isEmpty()) {
            // parsing reported problems, so no model is built.
            return;
        }

        mzTabFile = new MzTab();
        mzTabFile.metadata(mtdParser.getMetadata());
        for (Comment comment : commentMap.values()) {
            mzTabFile.addCommentItem(comment);
        }

        if (smallMoleculeSummaryMap.isEmpty()) {
            errorList.add(new MZTabError(
                LogicalErrorType.NoSmallMoleculeSummarySection, -1));
        }
        if (smlParser != null) {
            for (SmallMoleculeSummary sml : smallMoleculeSummaryMap.values()) {
                mzTabFile.addSmallMoleculeSummaryItem(sml);
            }
            //check that reliability values are correct
            if (mzTabFile.getMetadata().
                getSmallMoleculeIdentificationReliability() == null) {
                Pattern p = Pattern.compile(REGEX_DEFAULT_RELIABILITY);
                for (SmallMoleculeSummary smi : mzTabFile.
                    getSmallMoleculeSummary()) {
                    String reliability = smi.getReliability();
                    if (reliability == null) {
                        // no reliability given for this row: there is no
                        // value to validate, and matching null would fail
                        // with a NullPointerException.
                        continue;
                    }
                    if (!p.matcher(reliability).matches()) {
                        errorList.add(new MZTabError(
                            FormatErrorType.RegexMismatch, -1,
                            SmallMoleculeSummary.Properties.reliability.
                                getPropertyName(), reliability,
                            MzTab.Properties.smallMoleculeSummary.
                                getPropertyName(), "" + smi.getSmlId(),
                            REGEX_DEFAULT_RELIABILITY));
                    }
                }
            }
            checkColunitMapping(smhParser.getFactory(), Optional.ofNullable(
                mzTabFile.
                    getMetadata().
                    getColunitSmallMolecule()),
                Metadata.Properties.colunitSmallMolecule,
                MzTab.Properties.smallMoleculeSummary);
        }

        if (smallMoleculeFeatureMap.isEmpty() && !smallMoleculeSummaryMap.
            isEmpty()) {
            errorList.add(new MZTabError(
                LogicalErrorType.NoSmallMoleculeFeatureSection, -1));
        }
        if (smfParser != null) {
            for (SmallMoleculeFeature smf : smallMoleculeFeatureMap.values()) {
                mzTabFile.addSmallMoleculeFeatureItem(smf);
            }
            if (smallMoleculeFeatureMap.size() > 0 && mzTabFile.
                getMetadata().
                getSmallMoleculeFeatureQuantificationUnit() == null) {
                errorList.add(new MZTabError(
                    LogicalErrorType.NoSmallMoleculeFeatureQuantificationUnit,
                    -1));
            }
            checkColunitMapping(sfhParser.getFactory(), Optional.ofNullable(
                mzTabFile.
                    getMetadata().
                    getColunitSmallMoleculeFeature()),
                Metadata.Properties.colunitSmallMoleculeFeature,
                MzTab.Properties.smallMoleculeFeature);
        }
        if (smallMoleculeEvidenceMap.isEmpty() && !smallMoleculeSummaryMap.
            isEmpty()) {
            errorList.add(new MZTabError(
                LogicalErrorType.NoSmallMoleculeEvidenceSection, -1));
        }
        if (smeParser != null) {
            for (SmallMoleculeEvidence sme : smallMoleculeEvidenceMap.values()) {
                mzTabFile.addSmallMoleculeEvidenceItem(sme);
            }
            checkColunitMapping(sehParser.getFactory(), Optional.ofNullable(
                mzTabFile.
                    getMetadata().
                    getColunitSmallMoleculeEvidence()),
                Metadata.Properties.colunitSmallMoleculeEvidence,
                MzTab.Properties.smallMoleculeEvidence
            );
        }
        //check ID refs, starting at SML level
        if (smlParser != null && smfParser != null) {
            // the set of defined SMF ids is the same for every summary row,
            // so it is computed once instead of once per row.
            Set<Integer> definedSmfIds = smallMoleculeFeatureMap.values().
                stream().
                map(SmallMoleculeFeature::getSmfId).
                collect(Collectors.toSet());
            for (SmallMoleculeSummary sms : smallMoleculeSummaryMap.values()) {
                for (Integer smfRefId : undefinedRefs(sms.getSmfIdRefs(),
                    definedSmfIds)) {
                    //raise a warning about unmatched SMF id
                    //Reference id "{0}" for column "{1}" from element "{2}" in section "{3}" to section "{4}" must have a matching element defined.
                    errorList.add(new MZTabError(
                        LogicalErrorType.UnknownRefId, -1, "" + smfRefId,
                        SmallMoleculeSummary.Properties.smfIdRefs.
                            getPropertyName(), "" + sms.getSmlId(),
                        MzTab.Properties.smallMoleculeSummary.
                            getPropertyName(),
                        MzTab.Properties.smallMoleculeFeature.
                            getPropertyName()));
                }
            }
            if (smeParser != null) {
                Set<Integer> definedSmeIds = smallMoleculeEvidenceMap.values().
                    stream().
                    map(SmallMoleculeEvidence::getSmeId).
                    collect(Collectors.toSet());
                for (SmallMoleculeFeature smf : smallMoleculeFeatureMap.
                    values()) {
                    for (Integer smeRefId : undefinedRefs(smf.getSmeIdRefs(),
                        definedSmeIds)) {
                        //raise a warning about unmatched SME id
                        //Reference id "{0}" for column "{1}" from element "{2}" in section "{3}" to section "{4}" must have a matching element defined.
                        errorList.add(new MZTabError(
                            LogicalErrorType.UnknownRefId, -1,
                            "" + smeRefId,
                            SmallMoleculeFeature.Properties.smeIdRefs.
                                getPropertyName(), "" + smf.getSmfId(),
                            MzTab.Properties.smallMoleculeFeature.
                                getPropertyName(),
                            MzTab.Properties.smallMoleculeEvidence.
                                getPropertyName()));
                    }
                }
            }
        }

    }

    /**
     * Determine the referenced ids that are not contained in the set of
     * defined ids.
     *
     * @param refs the referenced ids, may be null if a row has no references.
     * @param definedIds the ids defined in the referenced section.
     * @return the referenced ids without a definition; empty if {@code refs}
     * is null or empty.
     */
    private static Set<Integer> undefinedRefs(
        Collection<? extends Integer> refs, Set<Integer> definedIds) {
        if (refs == null || refs.isEmpty()) {
            return Collections.emptySet();
        }
        Set<Integer> unknown = new HashSet<>(refs);
        unknown.removeAll(definedIds);
        return unknown;
    }

    /**
     * Verify that every colunit definition refers to a column that exists in
     * the given column factory. For each mapping whose column name can not be
     * resolved via {@code findColumnByHeader}, a
     * {@code FormatErrorType.ColUnit} error is added to the error list.
     *
     * @param columnFactory the column factory of the section to check.
     * @param columnParameterMapping the colunit mappings defined in the
     * metadata; an empty optional is treated like an empty collection.
     * @param colUnitProperty the metadata colunit property, reported in the
     * error.
     * @param mzTabSection the section the colunit definitions apply to,
     * reported in the error.
     */
    protected void checkColunitMapping(MZTabColumnFactory columnFactory,
        Optional<Collection<ColumnParameterMapping>> columnParameterMapping,
        Metadata.Properties colUnitProperty, MzTab.Properties mzTabSection) {
        Collection<ColumnParameterMapping> mappings = columnParameterMapping.
            orElse(Collections.emptyList());
        for (ColumnParameterMapping mapping : mappings) {
            String header = mapping.getColumnName();
            IMZTabColumn resolved = columnFactory.findColumnByHeader(header);
            if (resolved != null) {
                continue;
            }
            MZTabError unknownColumn = new MZTabError(
                FormatErrorType.ColUnit, -1,
                colUnitProperty.getPropertyName(), header,
                mzTabSection.getPropertyName());
            errorList.add(unknownColumn);
        }
    }

    /**
     * Returns the parsed mzTab model. The model is only built during parsing
     * when no errors were collected while reading the file, so this method
     * returns {@code null} if parsing has not been run yet or never produced
     * a model.
     *
     * @return the parsed {@link de.isas.mztab2.model.MzTab} object, or
     * {@code null}.
     */
    public MzTab getMZTabFile() {
        return this.mzTabFile;
    }
}