Source code

001/* 
002 * Copyright 2018 Leibniz-Institut für Analytische Wissenschaften – ISAS – e.V..
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package uk.ac.ebi.pride.jmztab2.utils.parser;
017
018import de.isas.mztab2.model.Assay;
019import de.isas.mztab2.model.Metadata;
020import de.isas.mztab2.model.MsRun;
021import de.isas.mztab2.model.Parameter;
022import de.isas.mztab2.model.SmallMoleculeSummary;
023import de.isas.mztab2.model.StudyVariable;
024import java.util.regex.Matcher;
025import java.util.regex.Pattern;
026import uk.ac.ebi.pride.jmztab2.model.MZBoolean;
027import uk.ac.ebi.pride.jmztab2.model.MZTabColumn;
028import uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory;
029import uk.ac.ebi.pride.jmztab2.model.MZTabConstants;
030import uk.ac.ebi.pride.jmztab2.model.Section;
031import uk.ac.ebi.pride.jmztab2.utils.errors.FormatErrorType;
032import uk.ac.ebi.pride.jmztab2.utils.errors.LogicalErrorType;
033import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabError;
034import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabErrorList;
035import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException;
036
037
038/**
039 * A couple of common method used to parse a header line into {@link uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory} structure.
040 * 
041 * NOTICE: {@link uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory} maintain a couple of {@link MZTabColumn} which have internal logical
042 * position and order. In physical mzTab file, we allow user not obey this logical position organized way,
043 * and provide their date with own order. In order to distinguish them, we use physical position (a positive
044 * integer) to record the column location in mzTab file. And use {@link uk.ac.ebi.pride.jmztab2.utils.parser.PositionMapping} structure the maintain
045 * the mapping between them.
046 *
047 * @author qingwei
048 * @see SMHLineParser
049 * @see SMFLineParser
050 * @see SMELineParser
051 * @since 11/02/13
052 * 
053 */
054public abstract class MZTabHeaderLineParser extends MZTabLineParser {
055
056    protected MZTabColumnFactory factory;
057    protected Metadata metadata;
058
059    /**
060     * Parse a header line into {@link uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory} structure.
061     *
062     * @param context the parser context, keeping dynamic state and lookup associations.
063     * @param factory  SHOULD NOT set null
064     * @param metadata SHOULD NOT set null
065     */
066    protected MZTabHeaderLineParser(MZTabParserContext context, MZTabColumnFactory factory, Metadata metadata) {
067        super(context);
068        if (factory == null) {
069            throw new NullPointerException("Header line should be parsed first!");
070        }
071        this.factory = factory;
072
073        if (metadata == null) {
074            throw new NullPointerException("Metadata should be created first!");
075        }
076        this.metadata = metadata;
077    }
078
079    /**
080     * {@inheritDoc}
081     *
082     * Parse a header line into {@link MZTabColumnFactory} structure. There are several steps in this method:
083     * Step 1: {@link #parseColumns()} focus on validate and parse all columns. 
084     * Step 2: {@link #refine()}
085     */
086    @Override
087    public void parse(int lineNumber, String line, MZTabErrorList errorList) throws MZTabException {
088        super.parse(lineNumber, line, errorList);
089
090        int offset = parseColumns();
091        if (offset != items.length) {
092            this.errorList.add(new MZTabError(LogicalErrorType.HeaderLine, lineNumber, section.getName(), "" + offset, "" + items.length));
093        }
094
095        refine();
096
097    }
098
099    /**
100     * This methods delegates to the subclasses the parsing of the columns. All of the columns are defined in 
101     * {@link uk.ac.ebi.pride.jmztab2.model.SmallMoleculeColumn}, {@link uk.ac.ebi.pride.jmztab2.model.SmallMoleculeFeatureColumn}, or {@link uk.ac.ebi.pride.jmztab2.model.SmallMoleculeEvidenceColumn}.
102     *
103     * @return the next physical index of column available after the parsing.
104     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
105     */
106    protected abstract int parseColumns() throws MZTabException;
107
108
109    /**
110     * Some validate operation need to be done after the whole {@link uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory} created.
111     * Thus, user can add them, and called at the end of the
112     * {@link #parse(int, String, MZTabErrorList)} method.
113     *
114     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
115     */
116    protected abstract void refine() throws MZTabException;
117
118
119    /**
120     * Refine optional columns and check, whether they were properly defined.
121     * These re-validate operation will called in {@link #refine()} method.
122     *
123     * @param section a {@link Section} object defining the part of the document.
124     * @param columnHeader a {@link java.lang.String} object.
125     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
126     */
127    protected void refineOptionalColumn(Section section, String columnHeader) throws MZTabException {
128        if (factory.findColumnByHeader(columnHeader) == null) {
129            throw new MZTabException(new MZTabError(LogicalErrorType.NotDefineInHeader, lineNumber, columnHeader, section.getName()));
130        }
131    }
132
133    /**
134     * <p>fromIndexToOrder.</p>
135     *
136     * @param index a {@link java.lang.Integer} object.
137     * @return a {@link java.lang.String} object.
138     */
139    protected String fromIndexToOrder(Integer index) {
140        return String.format("%02d", index);
141    }
142
143    /**
144     * Additional columns can be added to the end of the protein table. These column headers MUST start with the prefix "opt_".
145     * Column names MUST only contain the following characters: 'A'-'Z', 'a'-'z', '0'-'9', '_', '-', '[', ']', and ':'.
146     * 
147     * the format: opt_{IndexedElement[id]}_{value}. Spaces within the parameter's name MUST be replaced by '_'.
148     *
149     * @param nameLabel a {@link java.lang.String} object.
150     * @return a boolean.
151     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
152     */
153    protected boolean checkOptColumnName(String nameLabel) throws MZTabException {
154        nameLabel = nameLabel.trim();
155
156        String regexp = MZTabConstants.REGEX_OPT_COLUMN_NAME;
157        Pattern pattern = Pattern.compile(regexp);
158        Matcher matcher = pattern.matcher(nameLabel);
159
160        Integer id;
161        String object_id;
162        String value;
163        MZTabError error;
164        if (matcher.find()) {
165            object_id = matcher.group(1);
166            value = matcher.group(4);
167
168            Parameter param = null;
169            if (value.startsWith(MZTabConstants.CV_PREFIX)) {
170                param = checkCVParamOptColumnName(nameLabel, value);
171            }
172
173            Class dataType = getDataType(param);
174
175            if (object_id.contains(MZTabConstants.GLOBAL)) {
176                if (param == null) {
177                    factory.addOptionalColumn(value, dataType);
178                } else {
179                    factory.addOptionalColumn(param, dataType);
180                }
181            } else {
182                id = parseIndex(nameLabel, matcher.group(3));
183
184                if (object_id.contains(Metadata.Properties.assay.getPropertyName())) {
185                    Assay element = context.getAssayMap().get(id);
186                    // not found assay_id in metadata.
187                    if (element == null) {
188                        error = new MZTabError(LogicalErrorType.AssayNotDefined, lineNumber, nameLabel);
189                        throw new MZTabException(error);
190                    } else if (param == null) {
191                        factory.addOptionalColumn(element, value, dataType);
192                    } else {
193                        factory.addOptionalColumn(element, param, dataType);
194                    }
195                } else if (object_id.contains(Metadata.Properties.studyVariable.getPropertyName())) {
196                    StudyVariable element = context.getStudyVariableMap().get(id);
197                    // not found study_variable_id in metadata.
198                    if (element == null) {
199                        error = new MZTabError(LogicalErrorType.StudyVariableNotDefined, lineNumber, nameLabel);
200                        throw new MZTabException(error);
201                    } else if (param == null) {
202                        factory.addOptionalColumn(element, value, dataType);
203                    } else {
204                        factory.addOptionalColumn(element, param, dataType);
205                    }
206                } else if (object_id.contains(Metadata.Properties.msRun.getPropertyName())) {
207                    // not found ms_run_id in metadata.
208                    MsRun element = context.getMsRunMap().get(id);
209                    if (element == null) {
210                        error = new MZTabError(LogicalErrorType.MsRunNotDefined, lineNumber, nameLabel);
211                        throw new MZTabException(error);
212                    } else if (param == null) {
213                        factory.addOptionalColumn(element, value, dataType);
214                    } else {
215                        factory.addOptionalColumn(element, param, dataType);
216                    }
217                }
218            }
219
220            return true;
221        } else {
222            throw new MZTabException(new MZTabError(FormatErrorType.OptionalCVParamColumn, lineNumber, nameLabel));
223        }
224    }
225
226    /**
227     * An kind of {@link CVParamOptionColumn} which use CV parameter accessions in following the format:
228     * opt_{OBJECT_ID}_cv_{accession}_{parameter name}. Spaces within the parameter' s name MUST be replaced by '_'.
229     */
230    private Parameter checkCVParamOptColumnName(String nameLabel, String valueLabel) throws MZTabException {
231        nameLabel = nameLabel.trim();
232        valueLabel = valueLabel.trim();
233
234        String regexp = MZTabConstants.REGEX_CV_PARAM_OPT_COLUMN_NAME;
235        Pattern pattern = Pattern.compile(regexp);
236        Matcher matcher = pattern.matcher(valueLabel);
237
238        Parameter param;
239        if (!matcher.find() || matcher.end() != valueLabel.length()) {
240            throw new MZTabException(new MZTabError(FormatErrorType.OptionalCVParamColumn, lineNumber, nameLabel));
241        } else {
242            String accession = matcher.group(2);
243            String name = matcher.group(4);
244            if (name == null || name.trim().length() == 0) {
245                throw new MZTabException(new MZTabError(FormatErrorType.OptionalCVParamColumn, lineNumber, nameLabel));
246            }
247
248            param = matcher.group(4) == null ? null : new Parameter().cvAccession(accession).name(name);
249        }
250
251        return param;
252    }
253
254    /**
255     * Some {@link CVParamOptionColumn}, their data type have defined. Currently, we provide two {@link Parameter}
256     * which defined in the mzTab specification. One is "emPAI value" (MS:1001905), data type is Double;
257     * another is "decoy peptide" (MS:1002217), the data type is Boolean (0/1). Besides them, "opt_" start optional
258     * column data type is String.
259     *
260     * @see #checkOptColumnName(String)
261     */
262    private Class getDataType(Parameter param) {
263        Class dataType;
264
265        if (param == null) {
266            dataType = String.class;
267        } else if (param.getCvAccession().equals("MS:1001905")) {
268            dataType = Double.class;
269        } else if (param.getCvAccession().equals("MS:1002217")) {
270            dataType = MZBoolean.class;
271        } else if (param.getCvAccession().equals("PRIDE:0000303")) {
272            dataType = MZBoolean.class;
273        } else {
274            dataType = String.class;
275        }
276
277        return dataType;
278    }
279
280    /**
281     * <p>checkAbundanceColumns.</p>
282     *
283     * @param offset a int.
284     * @param order a {@link java.lang.String} object.
285     * @return a int.
286     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
287     */
288    protected int checkAbundanceColumns(int offset, String order) throws MZTabException {
289        String headerString = items[offset];
290        if (headerString.contains(SmallMoleculeSummary.Properties.abundanceAssay.getPropertyName())) {
291            checkAbundanceAssayColumn(headerString, order);
292            return offset;
293        } else if (headerString.contains(SmallMoleculeSummary.Properties.abundanceStudyVariable.getPropertyName()) || headerString.contains(SmallMoleculeSummary.Properties.abundanceVariationStudyVariable.getPropertyName())) {
294            checkAbundanceStudyVariableColumns(headerString, order);
295            return offset;
296        } else {
297            MZTabError error = new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, headerString);
298            throw new MZTabException(error);
299        }
300    }
301
302    /**
303     * Check (protein|peptide|smallmolecule)_abundance is correct, and return object value label.
304     * For example, protein_abundance_std_error_study_variable[id], return study_variable[id].
305     */
306    private String checkAbundanceSection(String abundanceHeader) throws MZTabException {
307        abundanceHeader = abundanceHeader.trim().toLowerCase();
308
309        Pattern pattern = Pattern.compile(MZTabConstants.REGEX_ABUNDANCE_COLUMN_NAME);
310        Matcher matcher = pattern.matcher(abundanceHeader);
311
312        if (matcher.find()) {
313//            String sectionName = matcher.group(1);
314//            if (sectionName != null &&
315//                    !(sectionName.equals(Section.Protein.getName()) && section != Section.Protein_Header) &&
316//                    !(sectionName.equals(Section.Peptide.getName()) && section != Section.Peptide_Header) &&
317//                    !(sectionName.equals(Section.Small_Molecule.getName()) && section != Section.Small_Molecule_Header)) {
318                return matcher.group(1);
319//            }
320
321//            MZTabError error = new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, abundanceHeader);
322//            throw new MZTabException(error);
323        } else {
324            MZTabError error = new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, abundanceHeader);
325            throw new MZTabException(error);
326        }
327    }
328
329    private void checkAbundanceAssayColumn(String abundanceHeader, String order) throws MZTabException {
330        String valueLabel = checkAbundanceSection(abundanceHeader);
331
332        Pattern pattern = Pattern.compile(MZTabConstants.REGEX_ABUNDANCE_ASSAY_COLUMN_NAME);
333        Matcher matcher = pattern.matcher(valueLabel);
334        if (!matcher.find()) {
335            MZTabError error = new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, abundanceHeader);
336            throw new MZTabException(error);
337        }
338
339        int id = parseIndex(abundanceHeader, matcher.group(1));
340        Assay assay = context.getAssayMap().get(id);
341        if (assay == null) {
342            MZTabError error = new MZTabError(LogicalErrorType.AssayNotDefined, lineNumber, abundanceHeader);
343            throw new MZTabException(error);
344        }
345
346        factory.addAbundanceOptionalColumn(assay, order);
347    }
348
349
350    private void checkAbundanceStudyVariableColumns(String header,
351                                                    String order) throws MZTabException {
352        header = header.trim().toLowerCase();
353
354        if (!header.contains(SmallMoleculeSummary.Properties.abundanceStudyVariable.getPropertyName()
355            ) && !header.contains(SmallMoleculeSummary.Properties.abundanceVariationStudyVariable.getPropertyName())) {
356            MZTabError error = new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, header);
357            throw new MZTabException(error);
358        } else {
359            StudyVariable abundanceStudyVariable = checkAbundanceStudyVariableColumn(header);
360
361            //adds both abundance_study_variable and abundance_coeffvar_study_variable columns
362            factory.addAbundanceOptionalColumn(abundanceStudyVariable, checkAbundanceSection(header), order);
363
364        }
365    }
366
367    /**
368     * Check XXXX_abundance_study_variable[id], XXXX_abundance_stdev_study_variable[id], XXXX_abundance_std_error_study_variable[id]
369     * column header. If parse error, stop validate and raise {@link MZTabException}.
370     */
371    private StudyVariable checkAbundanceStudyVariableColumn(String abundanceHeader) throws MZTabException {
372        String valueLabel = checkAbundanceSection(abundanceHeader);
373
374        Pattern pattern = Pattern.compile(MZTabConstants.REGEX_STUDY_VARIABLE_COLUMN_NAME);
375        Matcher matcher = pattern.matcher(valueLabel);
376        if (!matcher.find()) {
377            MZTabError error = new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, abundanceHeader);
378            throw new MZTabException(error);
379        }
380
381        int id = parseIndex(abundanceHeader, matcher.group(1));
382        StudyVariable studyVariable = context.getStudyVariableMap().get(id);
383        if (studyVariable == null) {
384            MZTabError error = new MZTabError(LogicalErrorType.StudyVariableNotDefined, lineNumber, abundanceHeader);
385            throw new MZTabException(error);
386        }
387
388        return studyVariable;
389    }
390
391    /**
392     * Parse header to a index id number.
393     * If exists parse error, stop validate and throw {@link uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException} directly.
394     *
395     * @param header a {@link java.lang.String} object.
396     * @param id a {@link java.lang.String} object.
397     * @return a int.
398     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
399     */
400    protected int parseIndex(String header, String id) throws MZTabException {
401        try {
402            Integer index = Integer.parseInt(id);
403            if (index < 1) {
404                throw new NumberFormatException();
405            }
406
407            return index;
408        } catch (NumberFormatException e) {
409            MZTabError error = new MZTabError(LogicalErrorType.IdNumber, lineNumber, header, id);
410            throw new MZTabException(error);
411        }
412    }
413
414    /**
415     * <p>Getter for the field <code>factory</code>.</p>
416     *
417     * @return a {@link uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory} object.
418     */
419    public MZTabColumnFactory getFactory() {
420        return factory;
421    }
422}