001/* 
002 * Copyright 2018 Leibniz-Institut für Analytische Wissenschaften – ISAS – e.V..
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package uk.ac.ebi.pride.jmztab2.model;
017
018import de.isas.mztab2.model.IndexedElement;
019import de.isas.mztab2.model.Metadata;
020import de.isas.mztab2.model.MsRun;
021import de.isas.mztab2.model.Parameter;
022import de.isas.mztab2.model.Publication;
023import de.isas.mztab2.model.PublicationItem;
024import de.isas.mztab2.model.SpectraRef;
025import java.net.URI;
026import java.net.URISyntaxException;
027import java.util.ArrayList;
028import java.util.Collections;
029import java.util.List;
030import java.util.regex.Matcher;
031import java.util.regex.Pattern;
032import java.util.stream.Collectors;
033import org.slf4j.Logger;
034import org.slf4j.LoggerFactory;
035import static uk.ac.ebi.pride.jmztab2.model.MZTabConstants.*;
036import static uk.ac.ebi.pride.jmztab2.model.MZTabStringUtils.*;
037import uk.ac.ebi.pride.jmztab2.utils.errors.FormatErrorType;
038import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabError;
039import uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException;
040import uk.ac.ebi.pride.jmztab2.utils.parser.MZTabParserContext;
041
/**
 * Provides utility functions for translating, parsing and printing the
 * formatted strings defined in the mzTab specification.
 *
 * @author qingwei
 * @author nilshoffmann
 * @since 30/01/13
 *
 */
051public class MZTabUtils {
052    
053    private static final Logger LOGGER = LoggerFactory.getLogger(
054        MZTabUtils.class);
055
056    /**
057     * If ratios are included and the denominator is zero, the "INF" value MUST
058     * be used. If the result leads to calculation errors (for example 0/0),
059     * this MUST be reported as "not a number" ("NaN").
060     *
061     * @see #parseDouble(String)
062     * @param value a {@link java.lang.Double} object.
063     * @return a {@link java.lang.String} object.
064     */
065    public static String printDouble(Double value) {
066        if (value == null) {
067            return NULL;
068        } else if (value.equals(Double.NaN)) {
069            return CALCULATE_ERROR;
070        } else if (value.equals(Double.POSITIVE_INFINITY)) {
071            return INFINITY;
072        } else {
073            return value.toString();
074        }
075    }
076
077    /**
078     * Parse the target string, and check is obey the email format or not. If
079     * not, return null.
080     *
081     * @param target a {@link java.lang.String} object.
082     * @return a {@link java.lang.String} object.
083     */
084    public static String parseEmail(String target) {
085        target = parseString(target);
086        if (target == null) {
087            return null;
088        }
089        
090        String regexp = REGEX_EMAIL;
091        Pattern pattern = Pattern.compile(regexp);
092        Matcher matcher = pattern.matcher(target);
093        
094        return matcher.find() ? target : null;
095    }
096
097    /**
098     * Parse the target string, and check it follows the mzTab Version format.
099     * If not, return null.
100     *
101     * @param target a {@link java.lang.String} object.
102     * @return a {@link java.lang.String} object.
103     */
104    public static String parseMzTabVersion(String target) {
105        target = parseString(target);
106        if (target == null) {
107            return null;
108        }
109        
110        Pattern versionPattern = Pattern.compile(MZTabConstants.REGEX_MZTAB_M);
111        Matcher m = versionPattern.matcher(target);
112        if (m.matches()) {
113            Integer major = Integer.parseInt(m.group("major"));
114            Integer minor = Integer.parseInt(m.group("minor"));
115            Integer micro = Integer.parseInt(m.group("micro"));
116            if (major != 2) {
117                return null;
118            }
119            if (!"M".equals(m.group("profile"))) {
120                return null;
121            }
122            return target;
123        }
124        return null;
125    }
126
127    /**
128     * Parameters are always reported as [CV label, accession, name, value]. Any
129     * field that is not available MUST be left empty.
130     *
131     * If the name or value of param contains comma, quotes MUST be added to
132     * avoid problems. Nested double quotes are not supported.
133     *
134     * Notice: name cell never set null.
135     *
136     * @param target a {@link java.lang.String} object.
137     * @return a {@link de.isas.mztab2.model.Parameter} object.
138     */
139    public static Parameter parseParam(String target) {
140        target = parseString(target);
141        if (target == null) {
142            return null;
143        }
144        
145        try {
146            target = target.substring(target.indexOf("[") + 1, target.
147                lastIndexOf("]"));
148            String[] tokens = target.split(REGEX_PARAM_SPLIT, -1);
149            
150            if (tokens.length == 4) {
151                String cvLabel = tokens[0].trim();
152                
153                String accession = tokens[1].trim();
154                
155                String name = tokens[2].trim();
156                if (name.contains("\"")) {  //We remove the escaping because it will be written back in the writer
157                    name = removeDoubleQuotes(name);
158                }
159                
160                if (isEmpty(name)) {
161                    return null;
162                }
163                
164                String value = tokens[3].trim();
165                if (value.contains("\"")) {  //We remove the escaping because it will be written back in the writer
166                    value = removeDoubleQuotes(value);
167                }
168                if (isEmpty(value)) {
169                    value = null;
170                }
171                
172                if (isEmpty(cvLabel) && isEmpty(accession)) {
173                    return new Parameter().name(name).
174                        value(value);
175                } else {
176                    return new Parameter().cvLabel(cvLabel).
177                        cvAccession(accession).
178                        name(name).
179                        value(value);
180                }
181            }
182        } catch (IndexOutOfBoundsException e) {
183            return null;
184        }
185        
186        return null;
187        
188    }
189
190    /**
191     * Multiple identifiers MUST be separated by splitChar.
192     *
193     * @param splitChar a char.
194     * @param target a {@link java.lang.String} object.
195     * @return a {@link java.util.List} object.
196     */
197    public static List<String> parseStringList(char splitChar, String target) {
198        List<String> list = new ArrayList<>(splitChar);
199        
200        target = parseString(target);
201        if (target == null) {
202            return list;
203        }
204
205        // regular express reserved keywords escape
206        StringBuilder sb = new StringBuilder();
207        switch (splitChar) {
208            case '.':
209            case '$':
210            case '^':
211            case '{':
212            case '}':
213            case '[':
214            case ']':
215            case '(':
216            case ')':
217            case '|':
218            case '*':
219            case '+':
220            case '?':
221            case '\\':
222                sb.append("\\").
223                    append(splitChar);
224                break;
225            default:
226                sb.append(splitChar);
227        }
228        
229        String[] items = target.split(sb.toString());
230        Collections.addAll(list, items);
231        
232        return list.stream().
233            map(value ->
234                value.trim()).
235            collect(Collectors.toList());
236    }
237
238    /**
239     * parse the target into a {@link de.isas.mztab2.model.IndexedElement}
240     * object.
241     *
242     * @param target a {@link java.lang.String} object.
243     * @param element a {@link uk.ac.ebi.pride.jmztab2.model.MetadataElement}
244     * object.
245     * @return a {@link de.isas.mztab2.model.IndexedElement} object.
246     */
247    public static IndexedElement parseIndexedElement(String target,
248        MetadataElement element) {
249        target = parseString(target);
250        if (target == null) {
251            return null;
252        }
253        
254        Pattern pattern = Pattern.compile(element + "\\[(\\d+)\\]");
255        Matcher matcher = pattern.matcher(target);
256        if (matcher.find()) {
257            Integer id = new Integer(matcher.group(1));
258            IndexedElement p = new IndexedElement().id(id);
259            p.elementType(element.getName());
260            return p;
261        } else {
262            return null;
263        }
264    }
265
266    /**
267     * Parse the target into a {@link de.isas.mztab2.model.IndexedElement} list.
268     *
269     * @param target a {@link java.lang.String} object.
270     * @param element a {@link uk.ac.ebi.pride.jmztab2.model.MetadataElement}
271     * object.
272     * @return a {@link java.util.List} object.
273     */
274    public static List<IndexedElement> parseRefList(String target,
275        MetadataElement element) {
276        List<String> list = parseStringList(MZTabConstants.COMMA, target);
277        
278        List<IndexedElement> indexedElementList = new ArrayList<>();
279        IndexedElement indexedElement;
280        for (String item : list) {
281            indexedElement = parseIndexedElement(item, element);
282            if (indexedElement == null) {
283                indexedElementList.clear();
284                return indexedElementList;
285            }
286            indexedElementList.add(indexedElement);
287        }
288        return indexedElementList;
289    }
290
291    /**
292     * A list of '|' separated parameters
293     *
294     * @param target a {@link java.lang.String} object.
295     * @return a {@link java.util.List} object.
296     */
297    public static List<Parameter> parseParamList(String target) {
298        List<String> list = parseStringList(BAR, target);
299        
300        Parameter param;
301        SplitList<Parameter> paramList = new SplitList<>(BAR);
302        for (String item : list) {
303            param = parseParam(item);
304            if (param == null) {
305                paramList.clear();
306                return paramList;
307            } else {
308                paramList.add(param);
309            }
310        }
311        
312        return paramList;
313    }
314
315    /**
316     * A '|' delimited list of GO accessions
317     *
318     * @param target a {@link java.lang.String} object.
319     * @return a {@link java.util.List} object.
320     */
321    public static List<String> parseGOTermList(String target) {
322        List<String> list = parseStringList(COMMA, target);
323        
324        List<String> goList = new SplitList<>(COMMA);
325        for (String item : list) {
326            item = parseString(item);
327            if (item.startsWith("GO:")) {
328                goList.add(item);
329            } else {
330                goList.clear();
331                break;
332            }
333        }
334        
335        return goList;
336    }
337
338    /**
339     * <p>
340     * parseInteger.</p>
341     *
342     * @param target a {@link java.lang.String} object.
343     * @return a {@link java.lang.Integer} object.
344     */
345    public static Integer parseInteger(String target) {
346        target = parseString(target);
347        if (target == null) {
348            return null;
349        }
350        
351        Integer integer;
352        
353        try {
354            integer = new Integer(target);
355        } catch (NumberFormatException e) {
356            integer = null;
357        }
358        
359        return integer;
360    }
361
362    /**
363     * NOTICE: If ratios are included and the denominator is zero, the "INF"
364     * value MUST be used. If the result leads to calculation errors (for
365     * example 0/0), this MUST be reported as "not a number" ("NaN").
366     *
367     * @param target a {@link java.lang.String} object.
368     * @return a {@link java.lang.Double} object.
369     */
370    public static Double parseDouble(String target) {
371        target = parseString(target);
372        if (target == null) {
373            return null;
374        }
375        
376        Double value;
377        try {
378            value = new Double(target);
379        } catch (NumberFormatException e) {
380            switch (target) {
381                case CALCULATE_ERROR:
382                    value = Double.NaN;
383                    break;
384                case INFINITY:
385                    value = Double.POSITIVE_INFINITY;
386                    break;
387                default:
388                    value = null;
389                    break;
390            }
391        }
392        
393        return value;
394    }
395
396    /**
397     * <p>
398     * parseLong.</p>
399     *
400     * @param target a {@link java.lang.String} object.
401     * @return a {@link java.lang.Long} object.
402     */
403    public static Long parseLong(String target) {
404        target = parseString(target);
405        if (target == null) {
406            return null;
407        }
408        
409        try {
410            return new Long(target);
411        } catch (NumberFormatException e) {
412            return null;
413        }
414    }
415
416    /**
417     * <p>
418     * parseDoubleList.</p>
419     *
420     * @param target a {@link java.lang.String} object.
421     * @return a {@link java.util.List} object.
422     */
423    public static List<Double> parseDoubleList(String target) {
424        List<String> list = parseStringList(BAR, target);
425        
426        Double value;
427        List<Double> valueList = new ArrayList<>(BAR);
428        for (String item : list) {
429            value = parseDouble(item);
430            if (value == null) {
431                valueList.clear();
432                break;
433            } else {
434                valueList.add(value);
435            }
436        }
437        
438        return valueList;
439    }
440
441    /**
442     * <p>
443     * parseIntegerList.</p>
444     *
445     * @param target a {@link java.lang.String} object.
446     * @return a {@link java.util.List} object.
447     */
448    public static List<Integer> parseIntegerList(String target) {
449        List<String> list = parseStringList(BAR, target);
450        
451        Integer value;
452        List<Integer> valueList = new ArrayList<>(BAR);
453        for (String item : list) {
454            value = parseInteger(item);
455            if (value == null) {
456                valueList.clear();
457                break;
458            } else {
459                valueList.add(value);
460            }
461        }
462        
463        return valueList;
464    }
465
466    /**
467     * <p>
468     * parseURI.</p>
469     *
470     * @param target a {@link java.lang.String} object.
471     * @return a {@link java.net.URI} object.
472     */
473    public static URI parseURI(String target) {
474        target = parseString(target);
475        if (target == null) {
476            return null;
477        }
478        
479        URI uri;
480        
481        try {
482            uri = new URI(target);
483        } catch (URISyntaxException e) {
484            uri = null;
485        }
486        
487        return uri;
488    }
489
490    /**
491     * A publication on this unit. PubMed ids must be prefixed by "pubmed:",
492     * DOIs by "doi:". Multiple identifiers MUST be separated by "|".
493     *
494     * @param publication a {@link de.isas.mztab2.model.Publication} object.
495     * @param lineNumber the line number while parsing.
496     * @param target a {@link java.lang.String} object.
497     * @return a {@link de.isas.mztab2.model.Publication} object.
498     * @throws uk.ac.ebi.pride.jmztab2.utils.errors.MZTabException in case of
499     * parsing or formatting issues of the publication string.
500     */
501    public static Publication parsePublicationItems(Publication publication,
502        int lineNumber, String target) throws MZTabException {
503        List<String> list = parseStringList(BAR, target);
504        
505        PublicationItem.TypeEnum type;
506        String accession;
507        PublicationItem item;
508        for (String pub : list) {
509            pub = parseString(pub).
510                toLowerCase();
511            if (pub == null) {
512                publication.getPublicationItems().
513                    clear();
514                return publication;
515            }
516            String[] items = pub.split("" + COLON);
517            if (items.length == 2) {
518                type = PublicationItem.TypeEnum.fromValue(items[0]);
519                if (type == null) {
520                    throw new MZTabException(new MZTabError(
521                        FormatErrorType.Publication, lineNumber, target, pub));
522                }
523                accession = items[1].trim();
524                item = new PublicationItem().type(type).
525                    accession(accession);
526                publication.addPublicationItemsItem(item);
527            } else {
528                throw new MZTabException(new MZTabError(
529                    FormatErrorType.Publication, lineNumber, target, pub));
530            }
531            
532        }
533        
534        return publication;
535    }
536
537    /**
538     * Parse a {@link de.isas.mztab2.model.SpectraRef} list.
539     *
540     * @param context a
541     * {@link uk.ac.ebi.pride.jmztab2.utils.parser.MZTabParserContext} object.
542     * @param metadata a {@link de.isas.mztab2.model.Metadata} object.
543     * @param target a {@link java.lang.String} object.
544     * @return a {@link java.util.List} object.
545     */
546    public static List<SpectraRef> parseSpectraRefList(
547        MZTabParserContext context, Metadata metadata, String target) {
548        List<String> list = parseStringList(BAR, target);
549        List<SpectraRef> refList = new ArrayList<>();
550        
551        Pattern pattern = Pattern.compile("ms_run\\[(\\d+)\\]:(.*)");
552        Matcher matcher;
553        Integer ms_file_id;
554        String reference;
555        SpectraRef ref;
556        for (String item : list) {
557            matcher = pattern.matcher(item.trim());
558            if (matcher.find()) {
559                ms_file_id = new Integer(matcher.group(1));
560                reference = matcher.group(2);
561                
562                MsRun msRun = context.getMsRunMap().
563                    get(ms_file_id);
564                if (msRun == null) {
565                    ref = null;
566                } else {
567                    ref = new SpectraRef().msRun(msRun).
568                        reference(reference);
569                }
570                
571                if (ref == null) {
572                    refList.clear();
573                    break;
574                } else {
575                    refList.add(ref);
576                }
577            }
578        }
579        
580        return refList;
581    }
582
583    /**
584     * Solve the conflict about minus char between modification position and
585     * CHEMMOD charge. For example: 13-CHEMMOD:-159
586     *
587     * @param target a {@link java.lang.String} object.
588     * @return a {@link java.lang.String} object.
589     */
590    public static String translateMinusToUnicode(String target) {
591        Pattern pattern = Pattern.compile("(CHEMMOD:.*)(-)(.*)");
592        Matcher matcher = pattern.matcher(target);
593        StringBuilder sb = new StringBuilder();
594        if (matcher.find()) {
595            sb.append(matcher.group(1));
596            sb.append("&minus;");
597            sb.append(matcher.group(3));
598            
599        } else {
600            sb.append(target);
601        }
602        return sb.toString();
603    }
604
605    /**
606     * <p>
607     * translateMinusInCVtoUnicode.</p>
608     *
609     * @param target a {@link java.lang.String} object.
610     * @return a {@link java.lang.String} object.
611     */
612    public static String translateMinusInCVtoUnicode(String target) {
613        Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]");
614        Matcher matcher = pattern.matcher(target);
615        
616        StringBuilder sb = new StringBuilder();
617        
618        int start = 0;
619        int end;
620        while (matcher.find()) {
621            end = matcher.start(1);
622            sb.append(target.substring(start, end));
623            sb.append(matcher.group(1).
624                replaceAll("-", "&minus;"));
625            start = matcher.end(1);
626        }
627        sb.append(target.substring(start, target.length()));
628        
629        return sb.toString();
630    }
631
632    /**
633     * <p>
634     * translateUnicodeCVTermMinus.</p>
635     *
636     * @param target a {@link java.lang.String} object.
637     * @return a {@link java.lang.String} object.
638     */
639    public static String translateUnicodeCVTermMinus(String target) {
640        return target.replaceAll("&minus;", "-");
641    }
642
643    /**
644     * Solve the conflict about minus char between modification position and
645     * CHEMMOD charge. For example: 13-CHEMMOD:-159
646     *
647     * @param target a {@link java.lang.String} object.
648     * @return a {@link java.lang.String} object.
649     */
650    public static String translateUnicodeToMinus(String target) {
651        Pattern pattern = Pattern.compile("(.*CHEMMOD:.*)(&minus;)(.*)");
652        Matcher matcher = pattern.matcher(target);
653        if (matcher.find()) {
654            StringBuilder sb = new StringBuilder();
655            
656            sb.append(matcher.group(1));
657            sb.append("-");
658            sb.append(matcher.group(3));
659            
660            return sb.toString();
661        } else {
662            return target;
663        }
664    }
665
666    /**
667     * locate param label [label, accession, name, value], translate ',' to '\t'
668     *
669     * @param target a {@link java.lang.String} object.
670     * @return a {@link java.lang.String} object.
671     */
672    public static String translateCommaToTab(String target) {
673        Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]");
674        Matcher matcher = pattern.matcher(target);
675        
676        StringBuilder sb = new StringBuilder();
677        
678        int start = 0;
679        int end;
680        while (matcher.find()) {
681            end = matcher.start(1);
682            sb.append(target.substring(start, end));
683            sb.append(matcher.group(1).
684                replaceAll(",", "\t"));
685            start = matcher.end(1);
686        }
687        sb.append(target.substring(start, target.length()));
688        
689        return sb.toString();
690    }
691
692    /**
693     * solve the conflict about comma char which used in split modification and
694     * split cv param components.
695     *
696     * @param target a {@link java.lang.String} object.
697     * @return a {@link java.lang.String} object.
698     */
699    public static String translateTabToComma(String target) {
700        Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]");
701        Matcher matcher = pattern.matcher(target);
702        
703        StringBuilder sb = new StringBuilder();
704        
705        int start = 0;
706        int end;
707        while (matcher.find()) {
708            end = matcher.start(1);
709            sb.append(target.substring(start, end));
710            sb.append(matcher.group(1).
711                replaceAll("\t", ","));
712            start = matcher.end(1);
713        }
714        sb.append(target.substring(start, target.length()));
715        
716        return sb.toString();
717    }
718
719    //Solve the problem for Neutral losses in CvTerm format
720    /**
721     * <p>
722     * translateMinusToTab.</p>
723     *
724     * @param target a {@link java.lang.String} object.
725     * @return a {@link java.lang.String} object.
726     */
727    public static String translateMinusToTab(String target) {
728        Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]");
729        Matcher matcher = pattern.matcher(target);
730        
731        StringBuilder sb = new StringBuilder();
732        
733        int start = 0;
734        int end;
735        while (matcher.find()) {
736            end = matcher.start(1);
737            sb.append(target.substring(start, end));
738            sb.append(matcher.group(1).
739                replaceAll("-", "\t"));
740            start = matcher.end(1);
741        }
742        sb.append(target.substring(start, target.length()));
743        
744        return sb.toString();
745        
746    }
747    
748    private static String replaceLast(String string, String toReplace,
749        String replacement) {
750        int pos = string.lastIndexOf(toReplace);
751        if (pos > -1) {
752            return string.substring(0, pos)
753                + replacement
754                + string.substring(pos + toReplace.length(), string.length());
755        }
756        return string;
757    }
758
759    /**
760     * <p>
761     * translateLastToTab.</p>
762     *
763     * @param target a {@link java.lang.String} object.
764     * @return a {@link java.lang.String} object.
765     */
766    public static String translateLastToTab(String target) {
767        Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]");
768        Matcher matcher = pattern.matcher(target);
769        
770        StringBuilder sb = new StringBuilder();
771        
772        int start = 0;
773        int end;
774        while (matcher.find()) {
775            end = matcher.start(1);
776            sb.append(target.substring(start, end));
777            sb.append(replaceLast(matcher.group(1), "-", "\t"));
778            start = matcher.end(1);
779        }
780        sb.append(target.substring(start, target.length()));
781        
782        return sb.toString();
783        
784    }
785
786    /**
787     * solve the conflict about comma char which used in split modification and
788     * split cv param components.
789     *
790     * @param target a {@link java.lang.String} object.
791     * @return a {@link java.lang.String} object.
792     */
793    public static String translateTabToMinus(String target) {
794        Pattern pattern = Pattern.compile("\\[([^\\[\\]]+)\\]");
795        Matcher matcher = pattern.matcher(target);
796        
797        StringBuilder sb = new StringBuilder();
798        
799        int start = 0;
800        int end;
801        while (matcher.find()) {
802            end = matcher.start(1);
803            sb.append(target.substring(start, end));
804            sb.append(matcher.group(1).
805                replaceAll("\t", "-"));
806            start = matcher.end(1);
807        }
808        sb.append(target.substring(start, target.length()));
809        
810        return sb.toString();
811    }
812
813    /**
814     * If there exists reserved characters in value, like comma, the string need
815     * to be escape. However the escaping char is not store because it will be
816     * write back in the writer. Nested double quotes are not supported.
817     *
818     * @param value a {@link java.lang.String} object.
819     * @return a {@link java.lang.String} object.
820     */
821    public static String removeDoubleQuotes(String value) {
822        
823        if (value != null) {
824            int length;
825            int count;
826            
827            value = value.trim();
828            length = value.length();
829            
830            value = value.replace("\"", "");
831            count = length - value.length();
832            
833            if (isEmpty(value)) {
834                value = null;
835            }
836            
837            if (count > 2) {
838                LOGGER.warn(
839                    "Nested double quotes in value, " + count + " occurrences have been replaced.");
840            }
841        }
842        
843        return value;
844    }
845    
846}