001/* 002 * Copyright 2018 Leibniz-Institut für Analytische Wissenschaften – ISAS – e.V.. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package uk.ac.ebi.pride.jmztab2.model; 017 018import de.isas.mztab2.model.IndexedElement; 019import de.isas.mztab2.model.IndexedElementImpl; 020import de.isas.mztab2.model.Metadata; 021import de.isas.mztab2.model.MsRun; 022import de.isas.mztab2.model.Parameter; 023import de.isas.mztab2.model.Publication; 024import de.isas.mztab2.model.PublicationItem; 025import de.isas.mztab2.model.SpectraRef; 026import java.net.URI; 027import java.net.URISyntaxException; 028import java.util.ArrayList; 029import java.util.Collections; 030import java.util.List; 031import java.util.regex.Matcher; 032import java.util.regex.Pattern; 033import java.util.stream.Collectors; 034import org.slf4j.Logger; 035import org.slf4j.LoggerFactory; 036import static uk.ac.ebi.pride.jmztab2.model.MZTabConstants.*; 037import static uk.ac.ebi.pride.jmztab2.model.MZTabStringUtils.*; 038import uk.ac.ebi.pride.jmztab2.utils.errors.FormatErrorType; 039import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabError; 040import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException; 041import uk.ac.ebi.pride.jmztab2.utils.parser.MZTabParserContext; 042 043/** 044 * Provide a couple of functions for translating, parsing and printing formatted strings 045 * defined in the mzTab specification. 046 * 047 * @author qingwei 048 * @author nilshoffmann 049 * @since 30/01/13 050 * 051 */ 052public class MZTabUtils { 053 054 private static final Logger LOGGER = LoggerFactory.getLogger( 055 MZTabUtils.class); 056 057 /** 058 * If ratios are included and the denominator is zero, the "INF" value MUST 059 * be used. If the result leads to calculation errors (for example 0/0), 060 * this MUST be reported as "not a number" ("NaN"). 061 * 062 * @see #parseDouble(String) 063 * @param value a {@link java.lang.Double} object. 064 * @return a {@link java.lang.String} object. 065 */ 066 public static String printDouble(Double value) { 067 if (value == null) { 068 return NULL; 069 } else if (value.equals(Double.NaN)) { 070 return CALCULATE_ERROR; 071 } else if (value.equals(Double.POSITIVE_INFINITY)) { 072 return INFINITY; 073 } else { 074 return value.toString(); 075 } 076 } 077 078 /** 079 * Parse the target string, and check is obey the email format or not. If 080 * not, return null. 081 * 082 * @param target a {@link java.lang.String} object. 083 * @return a {@link java.lang.String} object. 084 */ 085 public static String parseEmail(String target) { 086 target = parseString(target); 087 if (target == null) { 088 return null; 089 } 090 091 String regexp = REGEX_EMAIL; 092 Pattern pattern = Pattern.compile(regexp); 093 Matcher matcher = pattern.matcher(target); 094 095 return matcher.find() ? target : null; 096 } 097 098 /** 099 * Parse the target string, and check it follows the mzTab Version format. 100 * If not, return null. 101 * 102 * @param target a {@link java.lang.String} object. 103 * @return a {@link java.lang.String} object. 104 */ 105 public static String parseMzTabVersion(String target) { 106 target = parseString(target); 107 if (target == null) { 108 return null; 109 } 110 111 Pattern versionPattern = Pattern.compile(MZTabConstants.REGEX_MZTAB_M); 112 Matcher m = versionPattern.matcher(target); 113 if (m.matches()) { 114 Integer major = Integer.parseInt(m.group("major")); 115 Integer minor = Integer.parseInt(m.group("minor")); 116 Integer micro = Integer.parseInt(m.group("micro")); 117 if (major != 2) { 118 return null; 119 } 120 if (!"M".equals(m.group("profile"))) { 121 return null; 122 } 123 return target; 124 } 125 return null; 126 } 127 128 /** 129 * Parameters are always reported as [CV label, accession, name, value]. Any 130 * field that is not available MUST be left empty. 131 * 132 * If the name or value of param contains comma, quotes MUST be added to 133 * avoid problems. Nested double quotes are not supported. 134 * 135 * Notice: name cell never set null. 136 * 137 * @param target a {@link java.lang.String} object. 138 * @return a {@link de.isas.mztab2.model.Parameter} object. 139 */ 140 public static Parameter parseParam(String target) { 141 target = parseString(target); 142 if (target == null) { 143 return null; 144 } 145 146 try { 147 target = target.substring(target.indexOf("[") + 1, target. 148 lastIndexOf("]")); 149 String[] tokens = target.split(REGEX_PARAM_SPLIT, -1); 150 151 if (tokens.length == 4) { 152 String cvLabel = tokens[0].trim(); 153 154 String accession = tokens[1].trim(); 155 156 String name = tokens[2].trim(); 157 if (name.contains("\"")) { //We remove the escaping because it will be written back in the writer 158 name = removeDoubleQuotes(name); 159 } 160 161 if (isEmpty(name)) { 162 return null; 163 } 164 165 String value = tokens[3].trim(); 166 if (value.contains("\"")) { //We remove the escaping because it will be written back in the writer 167 value = removeDoubleQuotes(value); 168 } 169 if (isEmpty(value)) { 170 value = null; 171 } 172 173 if (isEmpty(cvLabel) && isEmpty(accession)) { 174 return new Parameter().name(name). 175 value(value); 176 } else { 177 return new Parameter().cvLabel(cvLabel). 178 cvAccession(accession). 179 name(name). 180 value(value); 181 } 182 } 183 } catch (IndexOutOfBoundsException e) { 184 return null; 185 } 186 187 return null; 188 189 } 190 191 /** 192 * Multiple identifiers MUST be separated by splitChar. 193 * 194 * @param splitChar a char. 195 * @param target a {@link java.lang.String} object. 196 * @return a {@link java.util.List} object. 197 */ 198 public static List<String> parseStringList(char splitChar, String target) { 199 List<String> list = new ArrayList<>(splitChar); 200 201 target = parseString(target); 202 if (target == null) { 203 return list; 204 } 205 206 // regular express reserved keywords escape 207 StringBuilder sb = new StringBuilder(); 208 switch (splitChar) { 209 case '.': 210 case '$': 211 case '^': 212 case '{': 213 case '}': 214 case '[': 215 case ']': 216 case '(': 217 case ')': 218 case '|': 219 case '*': 220 case '+': 221 case '?': 222 case '\\': 223 sb.append("\\"). 224 append(splitChar); 225 break; 226 default: 227 sb.append(splitChar); 228 } 229 230 String[] items = target.split(sb.toString()); 231 Collections.addAll(list, items); 232 233 return list.stream(). 234 map(value -> 235 value.trim()). 236 collect(Collectors.toList()); 237 } 238 239 /** 240 * parse the target into a {@link de.isas.mztab2.model.IndexedElement} 241 * object. 242 * 243 * @param target a {@link java.lang.String} object. 244 * @param element a {@link uk.ac.ebi.pride.jmztab2.model.MetadataElement} 245 * object. 246 * @return a {@link de.isas.mztab2.model.IndexedElement} object. 247 */ 248 public static IndexedElement parseIndexedElement(String target, 249 MetadataElement element) { 250 target = parseString(target); 251 if (target == null) { 252 return null; 253 } 254 255 Pattern pattern = Pattern.compile(element + "\\[(\\d+)\\]"); 256 Matcher matcher = pattern.matcher(target); 257 if (matcher.find()) { 258 Integer id = Integer.parseInt(matcher.group(1)); 259 IndexedElement p = new IndexedElementImpl(id, element.getName(), element); 260 return p; 261 } else { 262 return null; 263 } 264 } 265 266 /** 267 * Parse the target into a {@link de.isas.mztab2.model.IndexedElement} list. 268 * target is a '|' separated list of entries. 269 * 270 * @param target a {@link java.lang.String} object. 271 * @param element a {@link uk.ac.ebi.pride.jmztab2.model.MetadataElement} 272 * object. 273 * @return a {@link java.util.List} object. 274 */ 275 public static List<IndexedElement> parseRefList(String target, 276 MetadataElement element) { 277 List<String> list = parseStringList(MZTabConstants.BAR, target); 278 279 List<IndexedElement> indexedElementList = new ArrayList<>(); 280 IndexedElement indexedElement; 281 for (String item : list) { 282 indexedElement = parseIndexedElement(item, element); 283 if (indexedElement == null) { 284 indexedElementList.clear(); 285 return indexedElementList; 286 } 287 indexedElementList.add(indexedElement); 288 } 289 return indexedElementList; 290 } 291 292 /** 293 * A list of '|' separated parameters 294 * 295 * @param target a {@link java.lang.String} object. 296 * @return a {@link java.util.List} object. 297 */ 298 public static List<Parameter> parseParamList(String target) { 299 List<String> list = parseStringList(BAR, target); 300 301 Parameter param; 302 SplitList<Parameter> paramList = new SplitList<>(BAR); 303 for (String item : list) { 304 param = parseParam(item); 305 if (param == null) { 306 paramList.clear(); 307 return paramList; 308 } else { 309 paramList.add(param); 310 } 311 } 312 313 return paramList; 314 } 315 316 /** 317 * A '|' delimited list of GO accessions 318 * 319 * @param target a {@link java.lang.String} object. 320 * @return a {@link java.util.List} object. 321 */ 322 public static List<String> parseGOTermList(String target) { 323 List<String> list = parseStringList(COMMA, target); 324 325 List<String> goList = new SplitList<>(COMMA); 326 for (String item : list) { 327 item = parseString(item); 328 if (item.startsWith("GO:")) { 329 goList.add(item); 330 } else { 331 goList.clear(); 332 break; 333 } 334 } 335 336 return goList; 337 } 338 339 /** 340 * <p> 341 * parseInteger.</p> 342 * 343 * @param target a {@link java.lang.String} object. 344 * @return a {@link java.lang.Integer} object. 345 */ 346 public static Integer parseInteger(String target) { 347 target = parseString(target); 348 if (target == null) { 349 return null; 350 } 351 352 Integer integer; 353 354 try { 355 integer = new Integer(target); 356 } catch (NumberFormatException e) { 357 integer = null; 358 } 359 360 return integer; 361 } 362 363 /** 364 * NOTICE: If ratios are included and the denominator is zero, the "INF" 365 * value MUST be used. If the result leads to calculation errors (for 366 * example 0/0), this MUST be reported as "not a number" ("NaN"). 367 * 368 * @param target a {@link java.lang.String} object. 369 * @return a {@link java.lang.Double} object. 370 */ 371 public static Double parseDouble(String target) { 372 target = parseString(target); 373 if (target == null) { 374 return null; 375 } 376 377 Double value; 378 try { 379 value = new Double(target); 380 } catch (NumberFormatException e) { 381 switch (target) { 382 case CALCULATE_ERROR: 383 value = Double.NaN; 384 break; 385 case INFINITY: 386 value = Double.POSITIVE_INFINITY; 387 break; 388 default: 389 value = null; 390 break; 391 } 392 } 393 394 return value; 395 } 396 397 /** 398 * <p> 399 * parseLong.</p> 400 * 401 * @param target a {@link java.lang.String} object. 402 * @return a {@link java.lang.Long} object. 403 */ 404 public static Long parseLong(String target) { 405 target = parseString(target); 406 if (target == null) { 407 return null; 408 } 409 410 try { 411 return new Long(target); 412 } catch (NumberFormatException e) { 413 return null; 414 } 415 } 416 417 /** 418 * <p> 419 * parseDoubleList.</p> 420 * 421 * @param target a {@link java.lang.String} object. 422 * @return a {@link java.util.List} object. 423 */ 424 public static List<Double> parseDoubleList(String target) { 425 List<String> list = parseStringList(BAR, target); 426 427 Double value; 428 List<Double> valueList = new ArrayList<>(BAR); 429 for (String item : list) { 430 value = parseDouble(item); 431 if (value == null) { 432 valueList.clear(); 433 break; 434 } else { 435 valueList.add(value); 436 } 437 } 438 439 return valueList; 440 } 441 442 /** 443 * <p> 444 * parseIntegerList.</p> 445 * 446 * @param target a {@link java.lang.String} object. 447 * @return a {@link java.util.List} object. 448 */ 449 public static List<Integer> parseIntegerList(String target) { 450 List<String> list = parseStringList(BAR, target); 451 452 Integer value; 453 List<Integer> valueList = new ArrayList<>(BAR); 454 for (String item : list) { 455 value = parseInteger(item); 456 if (value == null) { 457 valueList.clear(); 458 break; 459 } else { 460 valueList.add(value); 461 } 462 } 463 464 return valueList; 465 } 466 467 /** 468 * <p> 469 * parseURI.</p> 470 * 471 * @param target a {@link java.lang.String} object. 472 * @return a {@link java.net.URI} object. 473 */ 474 public static URI parseURI(String target) { 475 target = parseString(target); 476 if (target == null) { 477 return null; 478 } 479 480 URI uri; 481 482 try { 483 uri = new URI(target); 484 } catch (URISyntaxException e) { 485 uri = null; 486 } 487 488 return uri; 489 } 490 491 /** 492 * A publication on this unit. PubMed ids must be prefixed by "pubmed:", 493 * DOIs by "doi:". Multiple identifiers MUST be separated by "|". 494 * 495 * @param publication a {@link de.isas.mztab2.model.Publication} object. 496 * @param lineNumber the line number while parsing. 497 * @param target a {@link java.lang.String} object. 498 * @return a {@link de.isas.mztab2.model.Publication} object. 499 * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException in case of 500 * parsing or formatting issues of the publication string. 501 */ 502 public static Publication parsePublicationItems(Publication publication, 503 int lineNumber, String target) throws MZTabException { 504 List<String> list = parseStringList(BAR, target); 505 506 PublicationItem.TypeEnum type; 507 String accession; 508 PublicationItem item; 509 for (String pub : list) { 510 pub = parseString(pub). 511 toLowerCase(); 512 if (pub == null) { 513 publication.getPublicationItems(). 514 clear(); 515 return publication; 516 } 517 String[] items = pub.split("" + COLON); 518 if (items.length == 2) { 519 type = PublicationItem.TypeEnum.fromValue(items[0]); 520 if (type == null) { 521 throw new MZTabException(new MZTabError( 522 FormatErrorType.Publication, lineNumber, target, pub)); 523 } 524 accession = items[1].trim(); 525 item = new PublicationItem().type(type). 526 accession(accession); 527 publication.addPublicationItemsItem(item); 528 } else { 529 throw new MZTabException(new MZTabError( 530 FormatErrorType.Publication, lineNumber, target, pub)); 531 } 532 533 } 534 535 return publication; 536 } 537 538 /** 539 * Parse a {@link de.isas.mztab2.model.SpectraRef} list. 540 * 541 * @param context a 542 * {@link uk.ac.ebi.pride.jmztab2.utils.parser.MZTabParserContext} object. 543 * @param metadata a {@link de.isas.mztab2.model.Metadata} object. 544 * @param target a {@link java.lang.String} object. 545 * @return a {@link java.util.List} object. 546 */ 547 public static List<SpectraRef> parseSpectraRefList( 548 MZTabParserContext context, Metadata metadata, String target) { 549 List<String> list = parseStringList(BAR, target); 550 List<SpectraRef> refList = new ArrayList<>(); 551 552 Pattern pattern = Pattern.compile("ms_run\\[(\\d+)\\]:(.*)"); 553 Matcher matcher; 554 Integer ms_file_id; 555 String reference; 556 SpectraRef ref; 557 for (String item : list) { 558 matcher = pattern.matcher(item.trim()); 559 if (matcher.find()) { 560 ms_file_id = new Integer(matcher.group(1)); 561 reference = matcher.group(2); 562 563 MsRun msRun = context.getMsRunMap(). 564 get(ms_file_id); 565 if (msRun == null) { 566 ref = null; 567 } else { 568 ref = new SpectraRef().msRun(msRun). 569 reference(reference); 570 } 571 572 if (ref == null) { 573 refList.clear(); 574 break; 575 } else { 576 refList.add(ref); 577 } 578 } 579 } 580 581 return refList; 582 } 583 584 /** 585 * Solve the conflict about minus char between modification position and 586 * CHEMMOD charge. For example: 13-CHEMMOD:-159 587 * 588 * @param target a {@link java.lang.String} object. 589 * @return a {@link java.lang.String} object. 590 */ 591 public static String translateMinusToUnicode(String target) { 592 Pattern pattern = Pattern.compile("(CHEMMOD:.*)(-)(.*)"); 593 Matcher matcher = pattern.matcher(target); 594 StringBuilder sb = new StringBuilder(); 595 if (matcher.find()) { 596 sb.append(matcher.group(1)); 597 sb.append("−"); 598 sb.append(matcher.group(3)); 599 600 } else { 601 sb.append(target); 602 } 603 return sb.toString(); 604 } 605 606 /** 607 * <p> 608 * translateMinusInCVtoUnicode.</p> 609 * 610 * @param target a {@link java.lang.String} object. 611 * @return a {@link java.lang.String} object. 612 */ 613 public static String translateMinusInCVtoUnicode(String target) { 614 Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]"); 615 Matcher matcher = pattern.matcher(target); 616 617 StringBuilder sb = new StringBuilder(); 618 619 int start = 0; 620 int end; 621 while (matcher.find()) { 622 end = matcher.start(1); 623 sb.append(target.substring(start, end)); 624 sb.append(matcher.group(1). 625 replaceAll("-", "−")); 626 start = matcher.end(1); 627 } 628 sb.append(target.substring(start, target.length())); 629 630 return sb.toString(); 631 } 632 633 /** 634 * <p> 635 * translateUnicodeCVTermMinus.</p> 636 * 637 * @param target a {@link java.lang.String} object. 638 * @return a {@link java.lang.String} object. 639 */ 640 public static String translateUnicodeCVTermMinus(String target) { 641 return target.replaceAll("−", "-"); 642 } 643 644 /** 645 * Solve the conflict about minus char between modification position and 646 * CHEMMOD charge. For example: 13-CHEMMOD:-159 647 * 648 * @param target a {@link java.lang.String} object. 649 * @return a {@link java.lang.String} object. 650 */ 651 public static String translateUnicodeToMinus(String target) { 652 Pattern pattern = Pattern.compile("(.*CHEMMOD:.*)(−)(.*)"); 653 Matcher matcher = pattern.matcher(target); 654 if (matcher.find()) { 655 StringBuilder sb = new StringBuilder(); 656 657 sb.append(matcher.group(1)); 658 sb.append("-"); 659 sb.append(matcher.group(3)); 660 661 return sb.toString(); 662 } else { 663 return target; 664 } 665 } 666 667 /** 668 * locate param label [label, accession, name, value], translate ',' to '\t' 669 * 670 * @param target a {@link java.lang.String} object. 671 * @return a {@link java.lang.String} object. 672 */ 673 public static String translateCommaToTab(String target) { 674 Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]"); 675 Matcher matcher = pattern.matcher(target); 676 677 StringBuilder sb = new StringBuilder(); 678 679 int start = 0; 680 int end; 681 while (matcher.find()) { 682 end = matcher.start(1); 683 sb.append(target.substring(start, end)); 684 sb.append(matcher.group(1). 685 replaceAll(",", "\t")); 686 start = matcher.end(1); 687 } 688 sb.append(target.substring(start, target.length())); 689 690 return sb.toString(); 691 } 692 693 /** 694 * solve the conflict about comma char which used in split modification and 695 * split cv param components. 696 * 697 * @param target a {@link java.lang.String} object. 698 * @return a {@link java.lang.String} object. 699 */ 700 public static String translateTabToComma(String target) { 701 Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]"); 702 Matcher matcher = pattern.matcher(target); 703 704 StringBuilder sb = new StringBuilder(); 705 706 int start = 0; 707 int end; 708 while (matcher.find()) { 709 end = matcher.start(1); 710 sb.append(target.substring(start, end)); 711 sb.append(matcher.group(1). 712 replaceAll("\t", ",")); 713 start = matcher.end(1); 714 } 715 sb.append(target.substring(start, target.length())); 716 717 return sb.toString(); 718 } 719 720 //Solve the problem for Neutral losses in CvTerm format 721 /** 722 * <p> 723 * translateMinusToTab.</p> 724 * 725 * @param target a {@link java.lang.String} object. 726 * @return a {@link java.lang.String} object. 727 */ 728 public static String translateMinusToTab(String target) { 729 Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]"); 730 Matcher matcher = pattern.matcher(target); 731 732 StringBuilder sb = new StringBuilder(); 733 734 int start = 0; 735 int end; 736 while (matcher.find()) { 737 end = matcher.start(1); 738 sb.append(target.substring(start, end)); 739 sb.append(matcher.group(1). 740 replaceAll("-", "\t")); 741 start = matcher.end(1); 742 } 743 sb.append(target.substring(start, target.length())); 744 745 return sb.toString(); 746 747 } 748 749 private static String replaceLast(String string, String toReplace, 750 String replacement) { 751 int pos = string.lastIndexOf(toReplace); 752 if (pos > -1) { 753 return string.substring(0, pos) 754 + replacement 755 + string.substring(pos + toReplace.length(), string.length()); 756 } 757 return string; 758 } 759 760 /** 761 * <p> 762 * translateLastToTab.</p> 763 * 764 * @param target a {@link java.lang.String} object. 765 * @return a {@link java.lang.String} object. 766 */ 767 public static String translateLastToTab(String target) { 768 Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]"); 769 Matcher matcher = pattern.matcher(target); 770 771 StringBuilder sb = new StringBuilder(); 772 773 int start = 0; 774 int end; 775 while (matcher.find()) { 776 end = matcher.start(1); 777 sb.append(target.substring(start, end)); 778 sb.append(replaceLast(matcher.group(1), "-", "\t")); 779 start = matcher.end(1); 780 } 781 sb.append(target.substring(start, target.length())); 782 783 return sb.toString(); 784 785 } 786 787 /** 788 * solve the conflict about comma char which used in split modification and 789 * split cv param components. 790 * 791 * @param target a {@link java.lang.String} object. 792 * @return a {@link java.lang.String} object. 793 */ 794 public static String translateTabToMinus(String target) { 795 Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]"); 796 Matcher matcher = pattern.matcher(target); 797 798 StringBuilder sb = new StringBuilder(); 799 800 int start = 0; 801 int end; 802 while (matcher.find()) { 803 end = matcher.start(1); 804 sb.append(target.substring(start, end)); 805 sb.append(matcher.group(1). 806 replaceAll("\t", "-")); 807 start = matcher.end(1); 808 } 809 sb.append(target.substring(start, target.length())); 810 811 return sb.toString(); 812 } 813 814 /** 815 * If there exists reserved characters in value, like comma, the string need 816 * to be escape. However the escaping char is not store because it will be 817 * write back in the writer. Nested double quotes are not supported. 818 * 819 * @param value a {@link java.lang.String} object. 820 * @return a {@link java.lang.String} object. 821 */ 822 public static String removeDoubleQuotes(String value) { 823 824 if (value != null) { 825 int length; 826 int count; 827 828 value = value.trim(); 829 length = value.length(); 830 831 value = value.replace("\"", ""); 832 count = length - value.length(); 833 834 if (isEmpty(value)) { 835 value = null; 836 } 837 838 if (count > 2) { 839 LOGGER.warn( 840 "Nested double quotes in value, " + count + " occurrences have been replaced."); 841 } 842 } 843 844 return value; 845 } 846 847}