001/* 002 * Copyright 2018 Leibniz-Institut für Analytische Wissenschaften – ISAS – e.V.. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package uk.ac.ebi.pride.jmztab2.model; 017 018import de.isas.mztab2.model.IndexedElement; 019import de.isas.mztab2.model.Metadata; 020import de.isas.mztab2.model.MsRun; 021import de.isas.mztab2.model.Parameter; 022import de.isas.mztab2.model.Publication; 023import de.isas.mztab2.model.PublicationItem; 024import de.isas.mztab2.model.SpectraRef; 025import java.net.URI; 026import java.net.URISyntaxException; 027import java.util.ArrayList; 028import java.util.Collections; 029import java.util.List; 030import java.util.regex.Matcher; 031import java.util.regex.Pattern; 032import java.util.stream.Collectors; 033import org.slf4j.Logger; 034import org.slf4j.LoggerFactory; 035import static uk.ac.ebi.pride.jmztab2.model.MZTabConstants.*; 036import static uk.ac.ebi.pride.jmztab2.model.MZTabStringUtils.*; 037import uk.ac.ebi.pride.jmztab2.utils.errors.FormatErrorType; 038import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabError; 039import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException; 040import uk.ac.ebi.pride.jmztab2.utils.parser.MZTabParserContext; 041 042/** 043 * Provide a couple of functions for translating, parsing and printing formatted strings 044 * defined in the mzTab specification. 045 * 046 * @author qingwei 047 * @author nilshoffmann 048 * @since 30/01/13 049 * 050 */ 051public class MZTabUtils { 052 053 private static final Logger LOGGER = LoggerFactory.getLogger( 054 MZTabUtils.class); 055 056 /** 057 * If ratios are included and the denominator is zero, the "INF" value MUST 058 * be used. If the result leads to calculation errors (for example 0/0), 059 * this MUST be reported as "not a number" ("NaN"). 060 * 061 * @see #parseDouble(String) 062 * @param value a {@link java.lang.Double} object. 063 * @return a {@link java.lang.String} object. 064 */ 065 public static String printDouble(Double value) { 066 if (value == null) { 067 return NULL; 068 } else if (value.equals(Double.NaN)) { 069 return CALCULATE_ERROR; 070 } else if (value.equals(Double.POSITIVE_INFINITY)) { 071 return INFINITY; 072 } else { 073 return value.toString(); 074 } 075 } 076 077 /** 078 * Parse the target string, and check is obey the email format or not. If 079 * not, return null. 080 * 081 * @param target a {@link java.lang.String} object. 082 * @return a {@link java.lang.String} object. 083 */ 084 public static String parseEmail(String target) { 085 target = parseString(target); 086 if (target == null) { 087 return null; 088 } 089 090 String regexp = REGEX_EMAIL; 091 Pattern pattern = Pattern.compile(regexp); 092 Matcher matcher = pattern.matcher(target); 093 094 return matcher.find() ? target : null; 095 } 096 097 /** 098 * Parse the target string, and check it follows the mzTab Version format. 099 * If not, return null. 100 * 101 * @param target a {@link java.lang.String} object. 102 * @return a {@link java.lang.String} object. 103 */ 104 public static String parseMzTabVersion(String target) { 105 target = parseString(target); 106 if (target == null) { 107 return null; 108 } 109 110 Pattern versionPattern = Pattern.compile(MZTabConstants.REGEX_MZTAB_M); 111 Matcher m = versionPattern.matcher(target); 112 if (m.matches()) { 113 Integer major = Integer.parseInt(m.group("major")); 114 Integer minor = Integer.parseInt(m.group("minor")); 115 Integer micro = Integer.parseInt(m.group("micro")); 116 if (major != 2) { 117 return null; 118 } 119 if (!"M".equals(m.group("profile"))) { 120 return null; 121 } 122 return target; 123 } 124 return null; 125 } 126 127 /** 128 * Parameters are always reported as [CV label, accession, name, value]. Any 129 * field that is not available MUST be left empty. 130 * 131 * If the name or value of param contains comma, quotes MUST be added to 132 * avoid problems. Nested double quotes are not supported. 133 * 134 * Notice: name cell never set null. 135 * 136 * @param target a {@link java.lang.String} object. 137 * @return a {@link de.isas.mztab2.model.Parameter} object. 138 */ 139 public static Parameter parseParam(String target) { 140 target = parseString(target); 141 if (target == null) { 142 return null; 143 } 144 145 try { 146 target = target.substring(target.indexOf("[") + 1, target. 147 lastIndexOf("]")); 148 String[] tokens = target.split(REGEX_PARAM_SPLIT, -1); 149 150 if (tokens.length == 4) { 151 String cvLabel = tokens[0].trim(); 152 153 String accession = tokens[1].trim(); 154 155 String name = tokens[2].trim(); 156 if (name.contains("\"")) { //We remove the escaping because it will be written back in the writer 157 name = removeDoubleQuotes(name); 158 } 159 160 if (isEmpty(name)) { 161 return null; 162 } 163 164 String value = tokens[3].trim(); 165 if (value.contains("\"")) { //We remove the escaping because it will be written back in the writer 166 value = removeDoubleQuotes(value); 167 } 168 if (isEmpty(value)) { 169 value = null; 170 } 171 172 if (isEmpty(cvLabel) && isEmpty(accession)) { 173 return new Parameter().name(name). 174 value(value); 175 } else { 176 return new Parameter().cvLabel(cvLabel). 177 cvAccession(accession). 178 name(name). 179 value(value); 180 } 181 } 182 } catch (IndexOutOfBoundsException e) { 183 return null; 184 } 185 186 return null; 187 188 } 189 190 /** 191 * Multiple identifiers MUST be separated by splitChar. 192 * 193 * @param splitChar a char. 194 * @param target a {@link java.lang.String} object. 195 * @return a {@link java.util.List} object. 196 */ 197 public static List<String> parseStringList(char splitChar, String target) { 198 List<String> list = new ArrayList<>(splitChar); 199 200 target = parseString(target); 201 if (target == null) { 202 return list; 203 } 204 205 // regular express reserved keywords escape 206 StringBuilder sb = new StringBuilder(); 207 switch (splitChar) { 208 case '.': 209 case '$': 210 case '^': 211 case '{': 212 case '}': 213 case '[': 214 case ']': 215 case '(': 216 case ')': 217 case '|': 218 case '*': 219 case '+': 220 case '?': 221 case '\\': 222 sb.append("\\"). 223 append(splitChar); 224 break; 225 default: 226 sb.append(splitChar); 227 } 228 229 String[] items = target.split(sb.toString()); 230 Collections.addAll(list, items); 231 232 return list.stream(). 233 map(value -> 234 value.trim()). 235 collect(Collectors.toList()); 236 } 237 238 /** 239 * parse the target into a {@link de.isas.mztab2.model.IndexedElement} 240 * object. 241 * 242 * @param target a {@link java.lang.String} object. 243 * @param element a {@link uk.ac.ebi.pride.jmztab2.model.MetadataElement} 244 * object. 245 * @return a {@link de.isas.mztab2.model.IndexedElement} object. 246 */ 247 public static IndexedElement parseIndexedElement(String target, 248 MetadataElement element) { 249 target = parseString(target); 250 if (target == null) { 251 return null; 252 } 253 254 Pattern pattern = Pattern.compile(element + "\\[(\\d+)\\]"); 255 Matcher matcher = pattern.matcher(target); 256 if (matcher.find()) { 257 Integer id = new Integer(matcher.group(1)); 258 IndexedElement p = new IndexedElement().id(id); 259 p.elementType(element.getName()); 260 return p; 261 } else { 262 return null; 263 } 264 } 265 266 /** 267 * Parse the target into a {@link de.isas.mztab2.model.IndexedElement} list. 268 * 269 * @param target a {@link java.lang.String} object. 270 * @param element a {@link uk.ac.ebi.pride.jmztab2.model.MetadataElement} 271 * object. 272 * @return a {@link java.util.List} object. 273 */ 274 public static List<IndexedElement> parseRefList(String target, 275 MetadataElement element) { 276 List<String> list = parseStringList(MZTabConstants.COMMA, target); 277 278 List<IndexedElement> indexedElementList = new ArrayList<>(); 279 IndexedElement indexedElement; 280 for (String item : list) { 281 indexedElement = parseIndexedElement(item, element); 282 if (indexedElement == null) { 283 indexedElementList.clear(); 284 return indexedElementList; 285 } 286 indexedElementList.add(indexedElement); 287 } 288 return indexedElementList; 289 } 290 291 /** 292 * A list of '|' separated parameters 293 * 294 * @param target a {@link java.lang.String} object. 295 * @return a {@link java.util.List} object. 296 */ 297 public static List<Parameter> parseParamList(String target) { 298 List<String> list = parseStringList(BAR, target); 299 300 Parameter param; 301 SplitList<Parameter> paramList = new SplitList<>(BAR); 302 for (String item : list) { 303 param = parseParam(item); 304 if (param == null) { 305 paramList.clear(); 306 return paramList; 307 } else { 308 paramList.add(param); 309 } 310 } 311 312 return paramList; 313 } 314 315 /** 316 * A '|' delimited list of GO accessions 317 * 318 * @param target a {@link java.lang.String} object. 319 * @return a {@link java.util.List} object. 320 */ 321 public static List<String> parseGOTermList(String target) { 322 List<String> list = parseStringList(COMMA, target); 323 324 List<String> goList = new SplitList<>(COMMA); 325 for (String item : list) { 326 item = parseString(item); 327 if (item.startsWith("GO:")) { 328 goList.add(item); 329 } else { 330 goList.clear(); 331 break; 332 } 333 } 334 335 return goList; 336 } 337 338 /** 339 * <p> 340 * parseInteger.</p> 341 * 342 * @param target a {@link java.lang.String} object. 343 * @return a {@link java.lang.Integer} object. 344 */ 345 public static Integer parseInteger(String target) { 346 target = parseString(target); 347 if (target == null) { 348 return null; 349 } 350 351 Integer integer; 352 353 try { 354 integer = new Integer(target); 355 } catch (NumberFormatException e) { 356 integer = null; 357 } 358 359 return integer; 360 } 361 362 /** 363 * NOTICE: If ratios are included and the denominator is zero, the "INF" 364 * value MUST be used. If the result leads to calculation errors (for 365 * example 0/0), this MUST be reported as "not a number" ("NaN"). 366 * 367 * @param target a {@link java.lang.String} object. 368 * @return a {@link java.lang.Double} object. 369 */ 370 public static Double parseDouble(String target) { 371 target = parseString(target); 372 if (target == null) { 373 return null; 374 } 375 376 Double value; 377 try { 378 value = new Double(target); 379 } catch (NumberFormatException e) { 380 switch (target) { 381 case CALCULATE_ERROR: 382 value = Double.NaN; 383 break; 384 case INFINITY: 385 value = Double.POSITIVE_INFINITY; 386 break; 387 default: 388 value = null; 389 break; 390 } 391 } 392 393 return value; 394 } 395 396 /** 397 * <p> 398 * parseLong.</p> 399 * 400 * @param target a {@link java.lang.String} object. 401 * @return a {@link java.lang.Long} object. 402 */ 403 public static Long parseLong(String target) { 404 target = parseString(target); 405 if (target == null) { 406 return null; 407 } 408 409 try { 410 return new Long(target); 411 } catch (NumberFormatException e) { 412 return null; 413 } 414 } 415 416 /** 417 * <p> 418 * parseDoubleList.</p> 419 * 420 * @param target a {@link java.lang.String} object. 421 * @return a {@link java.util.List} object. 422 */ 423 public static List<Double> parseDoubleList(String target) { 424 List<String> list = parseStringList(BAR, target); 425 426 Double value; 427 List<Double> valueList = new ArrayList<>(BAR); 428 for (String item : list) { 429 value = parseDouble(item); 430 if (value == null) { 431 valueList.clear(); 432 break; 433 } else { 434 valueList.add(value); 435 } 436 } 437 438 return valueList; 439 } 440 441 /** 442 * <p> 443 * parseIntegerList.</p> 444 * 445 * @param target a {@link java.lang.String} object. 446 * @return a {@link java.util.List} object. 447 */ 448 public static List<Integer> parseIntegerList(String target) { 449 List<String> list = parseStringList(BAR, target); 450 451 Integer value; 452 List<Integer> valueList = new ArrayList<>(BAR); 453 for (String item : list) { 454 value = parseInteger(item); 455 if (value == null) { 456 valueList.clear(); 457 break; 458 } else { 459 valueList.add(value); 460 } 461 } 462 463 return valueList; 464 } 465 466 /** 467 * <p> 468 * parseURI.</p> 469 * 470 * @param target a {@link java.lang.String} object. 471 * @return a {@link java.net.URI} object. 472 */ 473 public static URI parseURI(String target) { 474 target = parseString(target); 475 if (target == null) { 476 return null; 477 } 478 479 URI uri; 480 481 try { 482 uri = new URI(target); 483 } catch (URISyntaxException e) { 484 uri = null; 485 } 486 487 return uri; 488 } 489 490 /** 491 * A publication on this unit. PubMed ids must be prefixed by "pubmed:", 492 * DOIs by "doi:". Multiple identifiers MUST be separated by "|". 493 * 494 * @param publication a {@link de.isas.mztab2.model.Publication} object. 495 * @param lineNumber the line number while parsing. 496 * @param target a {@link java.lang.String} object. 497 * @return a {@link de.isas.mztab2.model.Publication} object. 498 * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException in case of 499 * parsing or formatting issues of the publication string. 500 */ 501 public static Publication parsePublicationItems(Publication publication, 502 int lineNumber, String target) throws MZTabException { 503 List<String> list = parseStringList(BAR, target); 504 505 PublicationItem.TypeEnum type; 506 String accession; 507 PublicationItem item; 508 for (String pub : list) { 509 pub = parseString(pub). 510 toLowerCase(); 511 if (pub == null) { 512 publication.getPublicationItems(). 513 clear(); 514 return publication; 515 } 516 String[] items = pub.split("" + COLON); 517 if (items.length == 2) { 518 type = PublicationItem.TypeEnum.fromValue(items[0]); 519 if (type == null) { 520 throw new MZTabException(new MZTabError( 521 FormatErrorType.Publication, lineNumber, target, pub)); 522 } 523 accession = items[1].trim(); 524 item = new PublicationItem().type(type). 525 accession(accession); 526 publication.addPublicationItemsItem(item); 527 } else { 528 throw new MZTabException(new MZTabError( 529 FormatErrorType.Publication, lineNumber, target, pub)); 530 } 531 532 } 533 534 return publication; 535 } 536 537 /** 538 * Parse a {@link de.isas.mztab2.model.SpectraRef} list. 539 * 540 * @param context a 541 * {@link uk.ac.ebi.pride.jmztab2.utils.parser.MZTabParserContext} object. 542 * @param metadata a {@link de.isas.mztab2.model.Metadata} object. 543 * @param target a {@link java.lang.String} object. 544 * @return a {@link java.util.List} object. 545 */ 546 public static List<SpectraRef> parseSpectraRefList( 547 MZTabParserContext context, Metadata metadata, String target) { 548 List<String> list = parseStringList(BAR, target); 549 List<SpectraRef> refList = new ArrayList<>(); 550 551 Pattern pattern = Pattern.compile("ms_run\\[(\\d+)\\]:(.*)"); 552 Matcher matcher; 553 Integer ms_file_id; 554 String reference; 555 SpectraRef ref; 556 for (String item : list) { 557 matcher = pattern.matcher(item.trim()); 558 if (matcher.find()) { 559 ms_file_id = new Integer(matcher.group(1)); 560 reference = matcher.group(2); 561 562 MsRun msRun = context.getMsRunMap(). 563 get(ms_file_id); 564 if (msRun == null) { 565 ref = null; 566 } else { 567 ref = new SpectraRef().msRun(msRun). 568 reference(reference); 569 } 570 571 if (ref == null) { 572 refList.clear(); 573 break; 574 } else { 575 refList.add(ref); 576 } 577 } 578 } 579 580 return refList; 581 } 582 583 /** 584 * Solve the conflict about minus char between modification position and 585 * CHEMMOD charge. For example: 13-CHEMMOD:-159 586 * 587 * @param target a {@link java.lang.String} object. 588 * @return a {@link java.lang.String} object. 589 */ 590 public static String translateMinusToUnicode(String target) { 591 Pattern pattern = Pattern.compile("(CHEMMOD:.*)(-)(.*)"); 592 Matcher matcher = pattern.matcher(target); 593 StringBuilder sb = new StringBuilder(); 594 if (matcher.find()) { 595 sb.append(matcher.group(1)); 596 sb.append("−"); 597 sb.append(matcher.group(3)); 598 599 } else { 600 sb.append(target); 601 } 602 return sb.toString(); 603 } 604 605 /** 606 * <p> 607 * translateMinusInCVtoUnicode.</p> 608 * 609 * @param target a {@link java.lang.String} object. 610 * @return a {@link java.lang.String} object. 611 */ 612 public static String translateMinusInCVtoUnicode(String target) { 613 Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]"); 614 Matcher matcher = pattern.matcher(target); 615 616 StringBuilder sb = new StringBuilder(); 617 618 int start = 0; 619 int end; 620 while (matcher.find()) { 621 end = matcher.start(1); 622 sb.append(target.substring(start, end)); 623 sb.append(matcher.group(1). 624 replaceAll("-", "−")); 625 start = matcher.end(1); 626 } 627 sb.append(target.substring(start, target.length())); 628 629 return sb.toString(); 630 } 631 632 /** 633 * <p> 634 * translateUnicodeCVTermMinus.</p> 635 * 636 * @param target a {@link java.lang.String} object. 637 * @return a {@link java.lang.String} object. 638 */ 639 public static String translateUnicodeCVTermMinus(String target) { 640 return target.replaceAll("−", "-"); 641 } 642 643 /** 644 * Solve the conflict about minus char between modification position and 645 * CHEMMOD charge. For example: 13-CHEMMOD:-159 646 * 647 * @param target a {@link java.lang.String} object. 648 * @return a {@link java.lang.String} object. 649 */ 650 public static String translateUnicodeToMinus(String target) { 651 Pattern pattern = Pattern.compile("(.*CHEMMOD:.*)(−)(.*)"); 652 Matcher matcher = pattern.matcher(target); 653 if (matcher.find()) { 654 StringBuilder sb = new StringBuilder(); 655 656 sb.append(matcher.group(1)); 657 sb.append("-"); 658 sb.append(matcher.group(3)); 659 660 return sb.toString(); 661 } else { 662 return target; 663 } 664 } 665 666 /** 667 * locate param label [label, accession, name, value], translate ',' to '\t' 668 * 669 * @param target a {@link java.lang.String} object. 670 * @return a {@link java.lang.String} object. 671 */ 672 public static String translateCommaToTab(String target) { 673 Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]"); 674 Matcher matcher = pattern.matcher(target); 675 676 StringBuilder sb = new StringBuilder(); 677 678 int start = 0; 679 int end; 680 while (matcher.find()) { 681 end = matcher.start(1); 682 sb.append(target.substring(start, end)); 683 sb.append(matcher.group(1). 684 replaceAll(",", "\t")); 685 start = matcher.end(1); 686 } 687 sb.append(target.substring(start, target.length())); 688 689 return sb.toString(); 690 } 691 692 /** 693 * solve the conflict about comma char which used in split modification and 694 * split cv param components. 695 * 696 * @param target a {@link java.lang.String} object. 697 * @return a {@link java.lang.String} object. 698 */ 699 public static String translateTabToComma(String target) { 700 Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]"); 701 Matcher matcher = pattern.matcher(target); 702 703 StringBuilder sb = new StringBuilder(); 704 705 int start = 0; 706 int end; 707 while (matcher.find()) { 708 end = matcher.start(1); 709 sb.append(target.substring(start, end)); 710 sb.append(matcher.group(1). 711 replaceAll("\t", ",")); 712 start = matcher.end(1); 713 } 714 sb.append(target.substring(start, target.length())); 715 716 return sb.toString(); 717 } 718 719 //Solve the problem for Neutral losses in CvTerm format 720 /** 721 * <p> 722 * translateMinusToTab.</p> 723 * 724 * @param target a {@link java.lang.String} object. 725 * @return a {@link java.lang.String} object. 726 */ 727 public static String translateMinusToTab(String target) { 728 Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]"); 729 Matcher matcher = pattern.matcher(target); 730 731 StringBuilder sb = new StringBuilder(); 732 733 int start = 0; 734 int end; 735 while (matcher.find()) { 736 end = matcher.start(1); 737 sb.append(target.substring(start, end)); 738 sb.append(matcher.group(1). 739 replaceAll("-", "\t")); 740 start = matcher.end(1); 741 } 742 sb.append(target.substring(start, target.length())); 743 744 return sb.toString(); 745 746 } 747 748 private static String replaceLast(String string, String toReplace, 749 String replacement) { 750 int pos = string.lastIndexOf(toReplace); 751 if (pos > -1) { 752 return string.substring(0, pos) 753 + replacement 754 + string.substring(pos + toReplace.length(), string.length()); 755 } 756 return string; 757 } 758 759 /** 760 * <p> 761 * translateLastToTab.</p> 762 * 763 * @param target a {@link java.lang.String} object. 764 * @return a {@link java.lang.String} object. 765 */ 766 public static String translateLastToTab(String target) { 767 Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]"); 768 Matcher matcher = pattern.matcher(target); 769 770 StringBuilder sb = new StringBuilder(); 771 772 int start = 0; 773 int end; 774 while (matcher.find()) { 775 end = matcher.start(1); 776 sb.append(target.substring(start, end)); 777 sb.append(replaceLast(matcher.group(1), "-", "\t")); 778 start = matcher.end(1); 779 } 780 sb.append(target.substring(start, target.length())); 781 782 return sb.toString(); 783 784 } 785 786 /** 787 * solve the conflict about comma char which used in split modification and 788 * split cv param components. 789 * 790 * @param target a {@link java.lang.String} object. 791 * @return a {@link java.lang.String} object. 792 */ 793 public static String translateTabToMinus(String target) { 794 Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]"); 795 Matcher matcher = pattern.matcher(target); 796 797 StringBuilder sb = new StringBuilder(); 798 799 int start = 0; 800 int end; 801 while (matcher.find()) { 802 end = matcher.start(1); 803 sb.append(target.substring(start, end)); 804 sb.append(matcher.group(1). 805 replaceAll("\t", "-")); 806 start = matcher.end(1); 807 } 808 sb.append(target.substring(start, target.length())); 809 810 return sb.toString(); 811 } 812 813 /** 814 * If there exists reserved characters in value, like comma, the string need 815 * to be escape. However the escaping char is not store because it will be 816 * write back in the writer. Nested double quotes are not supported. 817 * 818 * @param value a {@link java.lang.String} object. 819 * @return a {@link java.lang.String} object. 820 */ 821 public static String removeDoubleQuotes(String value) { 822 823 if (value != null) { 824 int length; 825 int count; 826 827 value = value.trim(); 828 length = value.length(); 829 830 value = value.replace("\"", ""); 831 count = length - value.length(); 832 833 if (isEmpty(value)) { 834 value = null; 835 } 836 837 if (count > 2) { 838 LOGGER.warn( 839 "Nested double quotes in value, " + count + " occurrences have been replaced."); 840 } 841 } 842 843 return value; 844 } 845 846}