/*
 * Copyright 2018 Leibniz-Institut für Analytische Wissenschaften – ISAS – e.V..
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package uk.ac.ebi.pride.jmztab2.utils.parser;

import de.isas.mztab2.model.Assay;
import de.isas.mztab2.model.Metadata;
import de.isas.mztab2.model.MsRun;
import de.isas.mztab2.model.Parameter;
import de.isas.mztab2.model.SmallMoleculeSummary;
import de.isas.mztab2.model.StudyVariable;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import uk.ac.ebi.pride.jmztab2.model.MZBoolean;
import uk.ac.ebi.pride.jmztab2.model.MZTabColumn;
import uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory;
import uk.ac.ebi.pride.jmztab2.model.MZTabConstants;
import uk.ac.ebi.pride.jmztab2.model.Section;
import uk.ac.ebi.pride.jmztab2.utils.errors.FormatErrorType;
import uk.ac.ebi.pride.jmztab2.utils.errors.LogicalErrorType;
import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabError;
import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabErrorList;
import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException;


/**
 * A couple of common methods used to parse a header line into a
 * {@link uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory} structure.
 *
 * NOTICE: {@link uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory} maintains a couple of {@link MZTabColumn}s
 * which have an internal logical position and order. In a physical mzTab file, we allow users not to obey this
 * logically organized ordering and to provide their data in their own order. In order to distinguish the two,
 * we use the physical position (a positive integer) to record the column location in the mzTab file, and use
 * the {@link uk.ac.ebi.pride.jmztab2.utils.parser.PositionMapping} structure to maintain the mapping between them.
 *
 * @author qingwei
 * @see SMHLineParser
 * @see SMFLineParser
 * @see SMELineParser
 * @since 11/02/13
 *
 */
public abstract class MZTabHeaderLineParser extends MZTabLineParser {

    // Compiled once: the regular expressions are constants, so recompiling them per header cell is wasted work.
    private static final Pattern OPT_COLUMN_NAME_PATTERN =
        Pattern.compile(MZTabConstants.REGEX_OPT_COLUMN_NAME);
    private static final Pattern CV_PARAM_OPT_COLUMN_NAME_PATTERN =
        Pattern.compile(MZTabConstants.REGEX_CV_PARAM_OPT_COLUMN_NAME);
    private static final Pattern ABUNDANCE_COLUMN_NAME_PATTERN =
        Pattern.compile(MZTabConstants.REGEX_ABUNDANCE_COLUMN_NAME);
    private static final Pattern ABUNDANCE_ASSAY_COLUMN_NAME_PATTERN =
        Pattern.compile(MZTabConstants.REGEX_ABUNDANCE_ASSAY_COLUMN_NAME);
    private static final Pattern STUDY_VARIABLE_COLUMN_NAME_PATTERN =
        Pattern.compile(MZTabConstants.REGEX_STUDY_VARIABLE_COLUMN_NAME);

    protected MZTabColumnFactory factory;
    protected Metadata metadata;

    /**
     * Parse a header line into {@link uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory} structure.
     *
     * @param context the parser context, keeping dynamic state and lookup associations.
     * @param factory SHOULD NOT be null
     * @param metadata SHOULD NOT be null
     * @throws NullPointerException if {@code factory} or {@code metadata} is null.
     */
    protected MZTabHeaderLineParser(MZTabParserContext context, MZTabColumnFactory factory, Metadata metadata) {
        super(context);
        if (factory == null) {
            throw new NullPointerException("Header line should be parsed first!");
        }
        this.factory = factory;

        if (metadata == null) {
            throw new NullPointerException("Metadata should be created first!");
        }
        this.metadata = metadata;
    }

    /**
     * {@inheritDoc}
     *
     * Parse a header line into {@link MZTabColumnFactory} structure. There are several steps in this method:
     * Step 1: {@link #parseColumns()} focuses on validating and parsing all columns.
     * Step 2: {@link #refine()}
     *
     * If the offset returned by {@link #parseColumns()} differs from the number of items on the line, a
     * {@link LogicalErrorType#HeaderLine} error is added to the error list before {@link #refine()} runs.
     */
    @Override
    public void parse(int lineNumber, String line, MZTabErrorList errorList) throws MZTabException {
        super.parse(lineNumber, line, errorList);

        int offset = parseColumns();
        if (offset != items.length) {
            this.errorList.add(new MZTabError(LogicalErrorType.HeaderLine, lineNumber, section.getName(),
                String.valueOf(offset), String.valueOf(items.length)));
        }

        refine();
    }

    /**
     * This method delegates the parsing of the columns to the subclasses. All of the columns are defined in
     * {@link uk.ac.ebi.pride.jmztab2.model.SmallMoleculeColumn},
     * {@link uk.ac.ebi.pride.jmztab2.model.SmallMoleculeFeatureColumn}, or
     * {@link uk.ac.ebi.pride.jmztab2.model.SmallMoleculeEvidenceColumn}.
     *
     * @return the next physical index of column available after the parsing.
     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
     */
    protected abstract int parseColumns() throws MZTabException;


    /**
     * Some validation operations need to be done after the whole
     * {@link uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory} has been created.
     * Subclasses add them here; this method is called at the end of the
     * {@link #parse(int, String, MZTabErrorList)} method.
     *
     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
     */
    protected abstract void refine() throws MZTabException;


    /**
     * Refine optional columns and check whether they were properly defined.
     * This re-validation is intended to be called from the {@link #refine()} method.
     *
     * @param section a {@link Section} object defining the part of the document.
     * @param columnHeader a {@link java.lang.String} object.
     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if the factory holds no column with the given header.
     */
    protected void refineOptionalColumn(Section section, String columnHeader) throws MZTabException {
        if (factory.findColumnByHeader(columnHeader) == null) {
            throw new MZTabException(new MZTabError(LogicalErrorType.NotDefineInHeader, lineNumber, columnHeader, section.getName()));
        }
    }

    /**
     * Formats an index as a decimal string, zero-padded to at least two digits.
     *
     * @param index a {@link java.lang.Integer} object.
     * @return a {@link java.lang.String} object.
     */
    protected String fromIndexToOrder(Integer index) {
        // Locale.ROOT keeps the digits ASCII regardless of the JVM's default locale.
        return String.format(Locale.ROOT, "%02d", index);
    }

    /**
     * Additional columns can be added to the end of the table. These column headers MUST start with the prefix "opt_".
     * Column names MUST only contain the following characters: 'A'-'Z', 'a'-'z', '0'-'9', '_', '-', '[', ']', and ':'.
     *
     * the format: opt_{IndexedElement[id]}_{value}. Spaces within the parameter's name MUST be replaced by '_'.
     *
     * On success the column is registered with the {@link MZTabColumnFactory}: as a global optional column, or
     * bound to the assay, study variable or ms_run referenced by the header.
     *
     * @param nameLabel a {@link java.lang.String} object.
     * @return {@code true} if the header was accepted; failures are reported by exception.
     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
     */
    protected boolean checkOptColumnName(String nameLabel) throws MZTabException {
        nameLabel = nameLabel.trim();

        Matcher matcher = OPT_COLUMN_NAME_PATTERN.matcher(nameLabel);
        if (!matcher.find()) {
            throw new MZTabException(new MZTabError(FormatErrorType.OptionalCVParamColumn, lineNumber, nameLabel));
        }

        String objectId = matcher.group(1);
        String value = matcher.group(4);

        // A value starting with the CV prefix must additionally be a well-formed CV parameter column name.
        Parameter param = null;
        if (value.startsWith(MZTabConstants.CV_PREFIX)) {
            param = checkCVParamOptColumnName(nameLabel, value);
        }

        Class<?> dataType = getDataType(param);

        if (objectId.contains(MZTabConstants.GLOBAL)) {
            if (param == null) {
                factory.addOptionalColumn(value, dataType);
            } else {
                factory.addOptionalColumn(param, dataType);
            }
            return true;
        }

        int id = parseIndex(nameLabel, matcher.group(3));

        if (objectId.contains(Metadata.Properties.assay.getPropertyName())) {
            Assay element = context.getAssayMap().get(id);
            if (element == null) {
                // assay_id not found in metadata.
                throw new MZTabException(new MZTabError(LogicalErrorType.AssayNotDefined, lineNumber, nameLabel));
            } else if (param == null) {
                factory.addOptionalColumn(element, value, dataType);
            } else {
                factory.addOptionalColumn(element, param, dataType);
            }
        } else if (objectId.contains(Metadata.Properties.studyVariable.getPropertyName())) {
            StudyVariable element = context.getStudyVariableMap().get(id);
            if (element == null) {
                // study_variable_id not found in metadata.
                throw new MZTabException(new MZTabError(LogicalErrorType.StudyVariableNotDefined, lineNumber, nameLabel));
            } else if (param == null) {
                factory.addOptionalColumn(element, value, dataType);
            } else {
                factory.addOptionalColumn(element, param, dataType);
            }
        } else if (objectId.contains(Metadata.Properties.msRun.getPropertyName())) {
            MsRun element = context.getMsRunMap().get(id);
            if (element == null) {
                // ms_run_id not found in metadata.
                throw new MZTabException(new MZTabError(LogicalErrorType.MsRunNotDefined, lineNumber, nameLabel));
            } else if (param == null) {
                factory.addOptionalColumn(element, value, dataType);
            } else {
                factory.addOptionalColumn(element, param, dataType);
            }
        }
        // NOTE(review): any other object id falls through without registering a column; presumably the
        // regular expression rules that out - confirm against MZTabConstants.REGEX_OPT_COLUMN_NAME.

        return true;
    }

    /**
     * A kind of {@code CVParamOptionColumn} which uses CV parameter accessions in the following format:
     * opt_{OBJECT_ID}_cv_{accession}_{parameter name}. Spaces within the parameter's name MUST be replaced by '_'.
     *
     * @param nameLabel the complete column header, used for error reporting.
     * @param valueLabel the value part of the header that is matched against the CV parameter pattern.
     * @return the parameter built from the accession and name found in {@code valueLabel}; never null.
     * @throws MZTabException if {@code valueLabel} does not match up to its end, or the parameter name is blank.
     */
    private Parameter checkCVParamOptColumnName(String nameLabel, String valueLabel) throws MZTabException {
        nameLabel = nameLabel.trim();
        valueLabel = valueLabel.trim();

        Matcher matcher = CV_PARAM_OPT_COLUMN_NAME_PATTERN.matcher(valueLabel);
        if (!matcher.find() || matcher.end() != valueLabel.length()) {
            throw new MZTabException(new MZTabError(FormatErrorType.OptionalCVParamColumn, lineNumber, nameLabel));
        }

        String accession = matcher.group(2);
        String name = matcher.group(4);
        if (name == null || name.trim().isEmpty()) {
            throw new MZTabException(new MZTabError(FormatErrorType.OptionalCVParamColumn, lineNumber, nameLabel));
        }

        return new Parameter().cvAccession(accession).name(name);
    }

    /**
     * For some {@code CVParamOptionColumn}s the data type is predefined. "emPAI value" (MS:1001905) maps to
     * Double; "decoy peptide" (MS:1002217) and PRIDE:0000303 map to {@link MZBoolean}. Every other "opt_"
     * column, including one without a parameter or without an accession, has data type String.
     *
     * @see #checkOptColumnName(String)
     */
    private Class<?> getDataType(Parameter param) {
        if (param == null) {
            return String.class;
        }

        // Constant-first equals: an accession of null yields the String default instead of a NullPointerException.
        String accession = param.getCvAccession();
        if ("MS:1001905".equals(accession)) {
            return Double.class;
        }
        if ("MS:1002217".equals(accession) || "PRIDE:0000303".equals(accession)) {
            return MZBoolean.class;
        }
        return String.class;
    }

    /**
     * Checks the abundance column header at the given physical position and registers the matching abundance
     * column (assay or study variable based) with the factory.
     *
     * @param offset the index into the header items of the column to check.
     * @param order a {@link java.lang.String} object, passed on to the factory.
     * @return the unchanged {@code offset}.
     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if any structural or logical errors are encountered that prohibit further processing.
     */
    protected int checkAbundanceColumns(int offset, String order) throws MZTabException {
        String headerString = items[offset];

        if (headerString.contains(SmallMoleculeSummary.Properties.abundanceAssay.getPropertyName())) {
            checkAbundanceAssayColumn(headerString, order);
            return offset;
        }

        if (headerString.contains(SmallMoleculeSummary.Properties.abundanceStudyVariable.getPropertyName())
            || headerString.contains(SmallMoleculeSummary.Properties.abundanceVariationStudyVariable.getPropertyName())) {
            checkAbundanceStudyVariableColumns(headerString, order);
            return offset;
        }

        throw new MZTabException(new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, headerString));
    }

    /**
     * Checks that the (trimmed, lower-cased) header matches the abundance column pattern and returns the first
     * capture group of that match.
     *
     * @throws MZTabException if the header does not match the abundance column pattern.
     */
    private String checkAbundanceSection(String abundanceHeader) throws MZTabException {
        // Locale.ROOT: default-locale lower-casing (e.g. Turkish) would turn 'I' into a dotless 'ı'.
        abundanceHeader = abundanceHeader.trim().toLowerCase(Locale.ROOT);

        Matcher matcher = ABUNDANCE_COLUMN_NAME_PATTERN.matcher(abundanceHeader);
        if (!matcher.find()) {
            throw new MZTabException(new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, abundanceHeader));
        }

        return matcher.group(1);
    }

    /**
     * Checks an assay abundance column header and registers the column for the referenced assay.
     *
     * @throws MZTabException if the header is malformed, the id is invalid, or the assay is not defined.
     */
    private void checkAbundanceAssayColumn(String abundanceHeader, String order) throws MZTabException {
        String valueLabel = checkAbundanceSection(abundanceHeader);

        Matcher matcher = ABUNDANCE_ASSAY_COLUMN_NAME_PATTERN.matcher(valueLabel);
        if (!matcher.find()) {
            throw new MZTabException(new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, abundanceHeader));
        }

        int id = parseIndex(abundanceHeader, matcher.group(1));
        Assay assay = context.getAssayMap().get(id);
        if (assay == null) {
            throw new MZTabException(new MZTabError(LogicalErrorType.AssayNotDefined, lineNumber, abundanceHeader));
        }

        factory.addAbundanceOptionalColumn(assay, order);
    }


    /**
     * Checks a study variable abundance column header (abundance or abundance variation) and registers the
     * column for the referenced study variable.
     *
     * @throws MZTabException if the header is malformed, the id is invalid, or the study variable is not defined.
     */
    private void checkAbundanceStudyVariableColumns(String header,
        String order) throws MZTabException {
        // Locale.ROOT: default-locale lower-casing (e.g. Turkish) would turn 'I' into a dotless 'ı'.
        header = header.trim().toLowerCase(Locale.ROOT);

        boolean isAbundance =
            header.contains(SmallMoleculeSummary.Properties.abundanceStudyVariable.getPropertyName());
        boolean isAbundanceVariation =
            header.contains(SmallMoleculeSummary.Properties.abundanceVariationStudyVariable.getPropertyName());
        if (!isAbundance && !isAbundanceVariation) {
            throw new MZTabException(new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, header));
        }

        StudyVariable abundanceStudyVariable = checkAbundanceStudyVariableColumn(header);

        // adds both abundance_study_variable and abundance_coeffvar_study_variable columns
        factory.addAbundanceOptionalColumn(abundanceStudyVariable, checkAbundanceSection(header), order);
    }

    /**
     * Check XXXX_abundance_study_variable[id], XXXX_abundance_stdev_study_variable[id], XXXX_abundance_std_error_study_variable[id]
     * column header. On a parse error, validation stops and an {@link MZTabException} is raised.
     *
     * @return the study variable registered in the parser context under the id found in the header.
     */
    private StudyVariable checkAbundanceStudyVariableColumn(String abundanceHeader) throws MZTabException {
        String valueLabel = checkAbundanceSection(abundanceHeader);

        Matcher matcher = STUDY_VARIABLE_COLUMN_NAME_PATTERN.matcher(valueLabel);
        if (!matcher.find()) {
            throw new MZTabException(new MZTabError(FormatErrorType.AbundanceColumn, lineNumber, abundanceHeader));
        }

        int id = parseIndex(abundanceHeader, matcher.group(1));
        StudyVariable studyVariable = context.getStudyVariableMap().get(id);
        if (studyVariable == null) {
            throw new MZTabException(new MZTabError(LogicalErrorType.StudyVariableNotDefined, lineNumber, abundanceHeader));
        }

        return studyVariable;
    }

    /**
     * Parse the id found in a header into an index number, which must be an integer of at least 1.
     * On a parse error, validation stops and a {@link uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException}
     * is thrown directly.
     *
     * @param header a {@link java.lang.String} object, used for error reporting.
     * @param id a {@link java.lang.String} object holding the index to parse.
     * @return the parsed index, always &gt;= 1.
     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException if {@code id} is not an integer or is smaller than 1.
     */
    protected int parseIndex(String header, String id) throws MZTabException {
        try {
            int index = Integer.parseInt(id);
            if (index >= 1) {
                return index;
            }
        } catch (NumberFormatException ignored) {
            // Not a number at all: reported below with the same error as an out-of-range index.
        }

        throw new MZTabException(new MZTabError(LogicalErrorType.IdNumber, lineNumber, header, id));
    }

    /**
     * <p>Getter for the field <code>factory</code>.</p>
     *
     * @return a {@link uk.ac.ebi.pride.jmztab2.model.MZTabColumnFactory} object.
     */
    public MZTabColumnFactory getFactory() {
        return factory;
    }
}