001/* 
002 * Copyright 2018 Leibniz-Institut für Analytische Wissenschaften – ISAS – e.V..
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package uk.ac.ebi.pride.jmztab2.model;
017
018import de.isas.mztab2.model.IndexedElement;
019import de.isas.mztab2.model.IndexedElementImpl;
020import de.isas.mztab2.model.Metadata;
021import de.isas.mztab2.model.MsRun;
022import de.isas.mztab2.model.Parameter;
023import de.isas.mztab2.model.Publication;
024import de.isas.mztab2.model.PublicationItem;
025import de.isas.mztab2.model.SpectraRef;
026import java.net.URI;
027import java.net.URISyntaxException;
028import java.util.ArrayList;
029import java.util.Collections;
030import java.util.List;
031import java.util.regex.Matcher;
032import java.util.regex.Pattern;
033import java.util.stream.Collectors;
034import org.slf4j.Logger;
035import org.slf4j.LoggerFactory;
036import static uk.ac.ebi.pride.jmztab2.model.MZTabConstants.*;
037import static uk.ac.ebi.pride.jmztab2.model.MZTabStringUtils.*;
038import uk.ac.ebi.pride.jmztab2.utils.errors.FormatErrorType;
039import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabError;
040import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException;
041import uk.ac.ebi.pride.jmztab2.utils.parser.MZTabParserContext;
042
043/**
044 * Provide a couple of functions for translating, parsing and printing formatted strings
045 * defined in the mzTab specification.
046 *
047 * @author qingwei
048 * @author nilshoffmann
049 * @since 30/01/13
050 *
051 */
052public class MZTabUtils {
053    
054    private static final Logger LOGGER = LoggerFactory.getLogger(
055        MZTabUtils.class);
056
057    /**
058     * If ratios are included and the denominator is zero, the "INF" value MUST
059     * be used. If the result leads to calculation errors (for example 0/0),
060     * this MUST be reported as "not a number" ("NaN").
061     *
062     * @see #parseDouble(String)
063     * @param value a {@link java.lang.Double} object.
064     * @return a {@link java.lang.String} object.
065     */
066    public static String printDouble(Double value) {
067        if (value == null) {
068            return NULL;
069        } else if (value.equals(Double.NaN)) {
070            return CALCULATE_ERROR;
071        } else if (value.equals(Double.POSITIVE_INFINITY)) {
072            return INFINITY;
073        } else {
074            return value.toString();
075        }
076    }
077
078    /**
079     * Parse the target string, and check is obey the email format or not. If
080     * not, return null.
081     *
082     * @param target a {@link java.lang.String} object.
083     * @return a {@link java.lang.String} object.
084     */
085    public static String parseEmail(String target) {
086        target = parseString(target);
087        if (target == null) {
088            return null;
089        }
090        
091        String regexp = REGEX_EMAIL;
092        Pattern pattern = Pattern.compile(regexp);
093        Matcher matcher = pattern.matcher(target);
094        
095        return matcher.find() ? target : null;
096    }
097
098    /**
099     * Parse the target string, and check it follows the mzTab Version format.
100     * If not, return null.
101     *
102     * @param target a {@link java.lang.String} object.
103     * @return a {@link java.lang.String} object.
104     */
105    public static String parseMzTabVersion(String target) {
106        target = parseString(target);
107        if (target == null) {
108            return null;
109        }
110        
111        Pattern versionPattern = Pattern.compile(MZTabConstants.REGEX_MZTAB_M);
112        Matcher m = versionPattern.matcher(target);
113        if (m.matches()) {
114            Integer major = Integer.parseInt(m.group("major"));
115            Integer minor = Integer.parseInt(m.group("minor"));
116            Integer micro = Integer.parseInt(m.group("micro"));
117            if (major != 2) {
118                return null;
119            }
120            if (!"M".equals(m.group("profile"))) {
121                return null;
122            }
123            return target;
124        }
125        return null;
126    }
127
128    /**
129     * Parameters are always reported as [CV label, accession, name, value]. Any
130     * field that is not available MUST be left empty.
131     *
132     * If the name or value of param contains comma, quotes MUST be added to
133     * avoid problems. Nested double quotes are not supported.
134     *
135     * Notice: name cell never set null.
136     *
137     * @param target a {@link java.lang.String} object.
138     * @return a {@link de.isas.mztab2.model.Parameter} object.
139     */
140    public static Parameter parseParam(String target) {
141        target = parseString(target);
142        if (target == null) {
143            return null;
144        }
145        
146        try {
147            target = target.substring(target.indexOf("[") + 1, target.
148                lastIndexOf("]"));
149            String[] tokens = target.split(REGEX_PARAM_SPLIT, -1);
150            
151            if (tokens.length == 4) {
152                String cvLabel = tokens[0].trim();
153                
154                String accession = tokens[1].trim();
155                
156                String name = tokens[2].trim();
157                if (name.contains("\"")) {  //We remove the escaping because it will be written back in the writer
158                    name = removeDoubleQuotes(name);
159                }
160                
161                if (isEmpty(name)) {
162                    return null;
163                }
164                
165                String value = tokens[3].trim();
166                if (value.contains("\"")) {  //We remove the escaping because it will be written back in the writer
167                    value = removeDoubleQuotes(value);
168                }
169                if (isEmpty(value)) {
170                    value = null;
171                }
172                
173                if (isEmpty(cvLabel) && isEmpty(accession)) {
174                    return new Parameter().name(name).
175                        value(value);
176                } else {
177                    return new Parameter().cvLabel(cvLabel).
178                        cvAccession(accession).
179                        name(name).
180                        value(value);
181                }
182            }
183        } catch (IndexOutOfBoundsException e) {
184            return null;
185        }
186        
187        return null;
188        
189    }
190
191    /**
192     * Multiple identifiers MUST be separated by splitChar.
193     *
194     * @param splitChar a char.
195     * @param target a {@link java.lang.String} object.
196     * @return a {@link java.util.List} object.
197     */
198    public static List<String> parseStringList(char splitChar, String target) {
199        List<String> list = new ArrayList<>(splitChar);
200        
201        target = parseString(target);
202        if (target == null) {
203            return list;
204        }
205
206        // regular express reserved keywords escape
207        StringBuilder sb = new StringBuilder();
208        switch (splitChar) {
209            case '.':
210            case '$':
211            case '^':
212            case '{':
213            case '}':
214            case '[':
215            case ']':
216            case '(':
217            case ')':
218            case '|':
219            case '*':
220            case '+':
221            case '?':
222            case '\\':
223                sb.append("\\").
224                    append(splitChar);
225                break;
226            default:
227                sb.append(splitChar);
228        }
229        
230        String[] items = target.split(sb.toString());
231        Collections.addAll(list, items);
232        
233        return list.stream().
234            map(value ->
235                value.trim()).
236            collect(Collectors.toList());
237    }
238
239    /**
240     * parse the target into a {@link de.isas.mztab2.model.IndexedElement}
241     * object.
242     *
243     * @param target a {@link java.lang.String} object.
244     * @param element a {@link uk.ac.ebi.pride.jmztab2.model.MetadataElement}
245     * object.
246     * @return a {@link de.isas.mztab2.model.IndexedElement} object.
247     */
248    public static IndexedElement parseIndexedElement(String target,
249        MetadataElement element) {
250        target = parseString(target);
251        if (target == null) {
252            return null;
253        }
254        
255        Pattern pattern = Pattern.compile(element + "\\[(\\d+)\\]");
256        Matcher matcher = pattern.matcher(target);
257        if (matcher.find()) {
258            Integer id = Integer.parseInt(matcher.group(1));
259            IndexedElement p = new IndexedElementImpl(id, element.getName(), element);
260            return p;
261        } else {
262            return null;
263        }
264    }
265
266    /**
267     * Parse the target into a {@link de.isas.mztab2.model.IndexedElement} list.
268     * target is a '|' separated list of entries.
269     *
270     * @param target a {@link java.lang.String} object.
271     * @param element a {@link uk.ac.ebi.pride.jmztab2.model.MetadataElement}
272     * object.
273     * @return a {@link java.util.List} object.
274     */
275    public static List<IndexedElement> parseRefList(String target,
276        MetadataElement element) {
277        List<String> list = parseStringList(MZTabConstants.BAR, target);
278        
279        List<IndexedElement> indexedElementList = new ArrayList<>();
280        IndexedElement indexedElement;
281        for (String item : list) {
282            indexedElement = parseIndexedElement(item, element);
283            if (indexedElement == null) {
284                indexedElementList.clear();
285                return indexedElementList;
286            }
287            indexedElementList.add(indexedElement);
288        }
289        return indexedElementList;
290    }
291
292    /**
293     * A list of '|' separated parameters
294     *
295     * @param target a {@link java.lang.String} object.
296     * @return a {@link java.util.List} object.
297     */
298    public static List<Parameter> parseParamList(String target) {
299        List<String> list = parseStringList(BAR, target);
300        
301        Parameter param;
302        SplitList<Parameter> paramList = new SplitList<>(BAR);
303        for (String item : list) {
304            param = parseParam(item);
305            if (param == null) {
306                paramList.clear();
307                return paramList;
308            } else {
309                paramList.add(param);
310            }
311        }
312        
313        return paramList;
314    }
315
316    /**
317     * A '|' delimited list of GO accessions
318     *
319     * @param target a {@link java.lang.String} object.
320     * @return a {@link java.util.List} object.
321     */
322    public static List<String> parseGOTermList(String target) {
323        List<String> list = parseStringList(COMMA, target);
324        
325        List<String> goList = new SplitList<>(COMMA);
326        for (String item : list) {
327            item = parseString(item);
328            if (item.startsWith("GO:")) {
329                goList.add(item);
330            } else {
331                goList.clear();
332                break;
333            }
334        }
335        
336        return goList;
337    }
338
339    /**
340     * <p>
341     * parseInteger.</p>
342     *
343     * @param target a {@link java.lang.String} object.
344     * @return a {@link java.lang.Integer} object.
345     */
346    public static Integer parseInteger(String target) {
347        target = parseString(target);
348        if (target == null) {
349            return null;
350        }
351        
352        Integer integer;
353        
354        try {
355            integer = new Integer(target);
356        } catch (NumberFormatException e) {
357            integer = null;
358        }
359        
360        return integer;
361    }
362
363    /**
364     * NOTICE: If ratios are included and the denominator is zero, the "INF"
365     * value MUST be used. If the result leads to calculation errors (for
366     * example 0/0), this MUST be reported as "not a number" ("NaN").
367     *
368     * @param target a {@link java.lang.String} object.
369     * @return a {@link java.lang.Double} object.
370     */
371    public static Double parseDouble(String target) {
372        target = parseString(target);
373        if (target == null) {
374            return null;
375        }
376        
377        Double value;
378        try {
379            value = new Double(target);
380        } catch (NumberFormatException e) {
381            switch (target) {
382                case CALCULATE_ERROR:
383                    value = Double.NaN;
384                    break;
385                case INFINITY:
386                    value = Double.POSITIVE_INFINITY;
387                    break;
388                default:
389                    value = null;
390                    break;
391            }
392        }
393        
394        return value;
395    }
396
397    /**
398     * <p>
399     * parseLong.</p>
400     *
401     * @param target a {@link java.lang.String} object.
402     * @return a {@link java.lang.Long} object.
403     */
404    public static Long parseLong(String target) {
405        target = parseString(target);
406        if (target == null) {
407            return null;
408        }
409        
410        try {
411            return new Long(target);
412        } catch (NumberFormatException e) {
413            return null;
414        }
415    }
416
417    /**
418     * <p>
419     * parseDoubleList.</p>
420     *
421     * @param target a {@link java.lang.String} object.
422     * @return a {@link java.util.List} object.
423     */
424    public static List<Double> parseDoubleList(String target) {
425        List<String> list = parseStringList(BAR, target);
426        
427        Double value;
428        List<Double> valueList = new ArrayList<>(BAR);
429        for (String item : list) {
430            value = parseDouble(item);
431            if (value == null) {
432                valueList.clear();
433                break;
434            } else {
435                valueList.add(value);
436            }
437        }
438        
439        return valueList;
440    }
441
442    /**
443     * <p>
444     * parseIntegerList.</p>
445     *
446     * @param target a {@link java.lang.String} object.
447     * @return a {@link java.util.List} object.
448     */
449    public static List<Integer> parseIntegerList(String target) {
450        List<String> list = parseStringList(BAR, target);
451        
452        Integer value;
453        List<Integer> valueList = new ArrayList<>(BAR);
454        for (String item : list) {
455            value = parseInteger(item);
456            if (value == null) {
457                valueList.clear();
458                break;
459            } else {
460                valueList.add(value);
461            }
462        }
463        
464        return valueList;
465    }
466
467    /**
468     * <p>
469     * parseURI.</p>
470     *
471     * @param target a {@link java.lang.String} object.
472     * @return a {@link java.net.URI} object.
473     */
474    public static URI parseURI(String target) {
475        target = parseString(target);
476        if (target == null) {
477            return null;
478        }
479        
480        URI uri;
481        
482        try {
483            uri = new URI(target);
484        } catch (URISyntaxException e) {
485            uri = null;
486        }
487        
488        return uri;
489    }
490
491    /**
492     * A publication on this unit. PubMed ids must be prefixed by "pubmed:",
493     * DOIs by "doi:". Multiple identifiers MUST be separated by "|".
494     *
495     * @param publication a {@link de.isas.mztab2.model.Publication} object.
496     * @param lineNumber the line number while parsing.
497     * @param target a {@link java.lang.String} object.
498     * @return a {@link de.isas.mztab2.model.Publication} object.
499     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException in case of
500     * parsing or formatting issues of the publication string.
501     */
502    public static Publication parsePublicationItems(Publication publication,
503        int lineNumber, String target) throws MZTabException {
504        List<String> list = parseStringList(BAR, target);
505        
506        PublicationItem.TypeEnum type;
507        String accession;
508        PublicationItem item;
509        for (String pub : list) {
510            pub = parseString(pub).
511                toLowerCase();
512            if (pub == null) {
513                publication.getPublicationItems().
514                    clear();
515                return publication;
516            }
517            String[] items = pub.split("" + COLON);
518            if (items.length == 2) {
519                type = PublicationItem.TypeEnum.fromValue(items[0]);
520                if (type == null) {
521                    throw new MZTabException(new MZTabError(
522                        FormatErrorType.Publication, lineNumber, target, pub));
523                }
524                accession = items[1].trim();
525                item = new PublicationItem().type(type).
526                    accession(accession);
527                publication.addPublicationItemsItem(item);
528            } else {
529                throw new MZTabException(new MZTabError(
530                    FormatErrorType.Publication, lineNumber, target, pub));
531            }
532            
533        }
534        
535        return publication;
536    }
537
538    /**
539     * Parse a {@link de.isas.mztab2.model.SpectraRef} list.
540     *
541     * @param context a
542     * {@link uk.ac.ebi.pride.jmztab2.utils.parser.MZTabParserContext} object.
543     * @param metadata a {@link de.isas.mztab2.model.Metadata} object.
544     * @param target a {@link java.lang.String} object.
545     * @return a {@link java.util.List} object.
546     */
547    public static List<SpectraRef> parseSpectraRefList(
548        MZTabParserContext context, Metadata metadata, String target) {
549        List<String> list = parseStringList(BAR, target);
550        List<SpectraRef> refList = new ArrayList<>();
551        
552        Pattern pattern = Pattern.compile("ms_run\\[(\\d+)\\]:(.*)");
553        Matcher matcher;
554        Integer ms_file_id;
555        String reference;
556        SpectraRef ref;
557        for (String item : list) {
558            matcher = pattern.matcher(item.trim());
559            if (matcher.find()) {
560                ms_file_id = new Integer(matcher.group(1));
561                reference = matcher.group(2);
562                
563                MsRun msRun = context.getMsRunMap().
564                    get(ms_file_id);
565                if (msRun == null) {
566                    ref = null;
567                } else {
568                    ref = new SpectraRef().msRun(msRun).
569                        reference(reference);
570                }
571                
572                if (ref == null) {
573                    refList.clear();
574                    break;
575                } else {
576                    refList.add(ref);
577                }
578            }
579        }
580        
581        return refList;
582    }
583
584    /**
585     * Solve the conflict about minus char between modification position and
586     * CHEMMOD charge. For example: 13-CHEMMOD:-159
587     *
588     * @param target a {@link java.lang.String} object.
589     * @return a {@link java.lang.String} object.
590     */
591    public static String translateMinusToUnicode(String target) {
592        Pattern pattern = Pattern.compile("(CHEMMOD:.*)(-)(.*)");
593        Matcher matcher = pattern.matcher(target);
594        StringBuilder sb = new StringBuilder();
595        if (matcher.find()) {
596            sb.append(matcher.group(1));
597            sb.append("&minus;");
598            sb.append(matcher.group(3));
599            
600        } else {
601            sb.append(target);
602        }
603        return sb.toString();
604    }
605
606    /**
607     * <p>
608     * translateMinusInCVtoUnicode.</p>
609     *
610     * @param target a {@link java.lang.String} object.
611     * @return a {@link java.lang.String} object.
612     */
613    public static String translateMinusInCVtoUnicode(String target) {
614        Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]");
615        Matcher matcher = pattern.matcher(target);
616        
617        StringBuilder sb = new StringBuilder();
618        
619        int start = 0;
620        int end;
621        while (matcher.find()) {
622            end = matcher.start(1);
623            sb.append(target.substring(start, end));
624            sb.append(matcher.group(1).
625                replaceAll("-", "&minus;"));
626            start = matcher.end(1);
627        }
628        sb.append(target.substring(start, target.length()));
629        
630        return sb.toString();
631    }
632
633    /**
634     * <p>
635     * translateUnicodeCVTermMinus.</p>
636     *
637     * @param target a {@link java.lang.String} object.
638     * @return a {@link java.lang.String} object.
639     */
640    public static String translateUnicodeCVTermMinus(String target) {
641        return target.replaceAll("&minus;", "-");
642    }
643
644    /**
645     * Solve the conflict about minus char between modification position and
646     * CHEMMOD charge. For example: 13-CHEMMOD:-159
647     *
648     * @param target a {@link java.lang.String} object.
649     * @return a {@link java.lang.String} object.
650     */
651    public static String translateUnicodeToMinus(String target) {
652        Pattern pattern = Pattern.compile("(.*CHEMMOD:.*)(&minus;)(.*)");
653        Matcher matcher = pattern.matcher(target);
654        if (matcher.find()) {
655            StringBuilder sb = new StringBuilder();
656            
657            sb.append(matcher.group(1));
658            sb.append("-");
659            sb.append(matcher.group(3));
660            
661            return sb.toString();
662        } else {
663            return target;
664        }
665    }
666
667    /**
668     * locate param label [label, accession, name, value], translate ',' to '\t'
669     *
670     * @param target a {@link java.lang.String} object.
671     * @return a {@link java.lang.String} object.
672     */
673    public static String translateCommaToTab(String target) {
674        Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]");
675        Matcher matcher = pattern.matcher(target);
676        
677        StringBuilder sb = new StringBuilder();
678        
679        int start = 0;
680        int end;
681        while (matcher.find()) {
682            end = matcher.start(1);
683            sb.append(target.substring(start, end));
684            sb.append(matcher.group(1).
685                replaceAll(",", "\t"));
686            start = matcher.end(1);
687        }
688        sb.append(target.substring(start, target.length()));
689        
690        return sb.toString();
691    }
692
693    /**
694     * solve the conflict about comma char which used in split modification and
695     * split cv param components.
696     *
697     * @param target a {@link java.lang.String} object.
698     * @return a {@link java.lang.String} object.
699     */
700    public static String translateTabToComma(String target) {
701        Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]");
702        Matcher matcher = pattern.matcher(target);
703        
704        StringBuilder sb = new StringBuilder();
705        
706        int start = 0;
707        int end;
708        while (matcher.find()) {
709            end = matcher.start(1);
710            sb.append(target.substring(start, end));
711            sb.append(matcher.group(1).
712                replaceAll("\t", ","));
713            start = matcher.end(1);
714        }
715        sb.append(target.substring(start, target.length()));
716        
717        return sb.toString();
718    }
719
720    //Solve the problem for Neutral losses in CvTerm format
721    /**
722     * <p>
723     * translateMinusToTab.</p>
724     *
725     * @param target a {@link java.lang.String} object.
726     * @return a {@link java.lang.String} object.
727     */
728    public static String translateMinusToTab(String target) {
729        Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]");
730        Matcher matcher = pattern.matcher(target);
731        
732        StringBuilder sb = new StringBuilder();
733        
734        int start = 0;
735        int end;
736        while (matcher.find()) {
737            end = matcher.start(1);
738            sb.append(target.substring(start, end));
739            sb.append(matcher.group(1).
740                replaceAll("-", "\t"));
741            start = matcher.end(1);
742        }
743        sb.append(target.substring(start, target.length()));
744        
745        return sb.toString();
746        
747    }
748    
749    private static String replaceLast(String string, String toReplace,
750        String replacement) {
751        int pos = string.lastIndexOf(toReplace);
752        if (pos > -1) {
753            return string.substring(0, pos)
754                + replacement
755                + string.substring(pos + toReplace.length(), string.length());
756        }
757        return string;
758    }
759
760    /**
761     * <p>
762     * translateLastToTab.</p>
763     *
764     * @param target a {@link java.lang.String} object.
765     * @return a {@link java.lang.String} object.
766     */
767    public static String translateLastToTab(String target) {
768        Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]");
769        Matcher matcher = pattern.matcher(target);
770        
771        StringBuilder sb = new StringBuilder();
772        
773        int start = 0;
774        int end;
775        while (matcher.find()) {
776            end = matcher.start(1);
777            sb.append(target.substring(start, end));
778            sb.append(replaceLast(matcher.group(1), "-", "\t"));
779            start = matcher.end(1);
780        }
781        sb.append(target.substring(start, target.length()));
782        
783        return sb.toString();
784        
785    }
786
787    /**
788     * solve the conflict about comma char which used in split modification and
789     * split cv param components.
790     *
791     * @param target a {@link java.lang.String} object.
792     * @return a {@link java.lang.String} object.
793     */
794    public static String translateTabToMinus(String target) {
795        Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]");
796        Matcher matcher = pattern.matcher(target);
797        
798        StringBuilder sb = new StringBuilder();
799        
800        int start = 0;
801        int end;
802        while (matcher.find()) {
803            end = matcher.start(1);
804            sb.append(target.substring(start, end));
805            sb.append(matcher.group(1).
806                replaceAll("\t", "-"));
807            start = matcher.end(1);
808        }
809        sb.append(target.substring(start, target.length()));
810        
811        return sb.toString();
812    }
813
814    /**
815     * If there exists reserved characters in value, like comma, the string need
816     * to be escape. However the escaping char is not store because it will be
817     * write back in the writer. Nested double quotes are not supported.
818     *
819     * @param value a {@link java.lang.String} object.
820     * @return a {@link java.lang.String} object.
821     */
822    public static String removeDoubleQuotes(String value) {
823        
824        if (value != null) {
825            int length;
826            int count;
827            
828            value = value.trim();
829            length = value.length();
830            
831            value = value.replace("\"", "");
832            count = length - value.length();
833            
834            if (isEmpty(value)) {
835                value = null;
836            }
837            
838            if (count > 2) {
839                LOGGER.warn(
840                    "Nested double quotes in value, " + count + " occurrences have been replaced.");
841            }
842        }
843        
844        return value;
845    }
846    
847}