LCOV - code coverage report
Current view: top level - mail - HtmlParser.java (source / functions) Hit Total Coverage
Test: _coverage_report.dat Lines: 51 51 100.0 %
Date: 2022-11-19 15:00:39 Functions: 3 3 100.0 %

          Line data    Source code
       1             : // Copyright (C) 2016 The Android Open Source Project
       2             : //
       3             : // Licensed under the Apache License, Version 2.0 (the "License");
       4             : // you may not use this file except in compliance with the License.
       5             : // You may obtain a copy of the License at
       6             : //
       7             : // http://www.apache.org/licenses/LICENSE-2.0
       8             : //
       9             : // Unless required by applicable law or agreed to in writing, software
      10             : // distributed under the License is distributed on an "AS IS" BASIS,
      11             : // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      12             : // See the License for the specific language governing permissions and
      13             : // limitations under the License.
      14             : 
      15             : package com.google.gerrit.mail;
      16             : 
      17             : import com.google.common.base.Strings;
      18             : import com.google.common.collect.ImmutableSet;
      19             : import com.google.common.collect.Iterators;
      20             : import com.google.common.collect.PeekingIterator;
      21             : import com.google.gerrit.entities.HumanComment;
      22             : import java.util.ArrayList;
      23             : import java.util.Collection;
      24             : import java.util.List;
      25             : import org.jsoup.Jsoup;
      26             : import org.jsoup.nodes.Document;
      27             : import org.jsoup.nodes.Element;
      28             : 
      29             : /** Provides functionality for parsing the HTML part of a {@link MailMessage}. */
      30             : public class HtmlParser {
      31             : 
      32           1 :   private static final ImmutableSet<String> MAIL_PROVIDER_EXTRAS =
      33           1 :       ImmutableSet.of(
      34             :           "gmail_extra", // "On 01/01/2017 User<user@gmail.com> wrote:"
      35             :           "gmail_quote" // Used for quoting original content
      36             :           );
      37             : 
      38           1 :   private static final ImmutableSet<String> ALLOWED_HTML_TAGS =
      39           1 :       ImmutableSet.of(
      40             :           "div", // Most user-typed comments are contained in a <div> tag
      41             :           "a", // We allow links to be contained in a comment
      42             :           "font" // Some email clients like nesting input in a new font tag
      43             :           );
      44             : 
      45             :   private HtmlParser() {}
      46             : 
      47             :   /**
      48             :    * Parses comments from html email.
      49             :    *
       50             :    * <p>This parser goes through all html elements in the email and checks for matching patterns. It
      51             :    * keeps track of the last file and comments it encountered to know in which context a parsed
       52             :    * comment belongs. It uses the href attributes of {@code <a>} tags to identify comments sent
       53             :    * out by Gerrit as these are generally more reliable than the text captions.
      54             :    *
      55             :    * @param email the message as received from the email service
      56             :    * @param comments a specific set of comments as sent out in the original notification email.
       57             :    *     Comments are expected to be in the same order as they were sent out in the email.
      58             :    * @param changeUrl canonical change URL that points to the change on this Gerrit instance.
      59             :    *     Example: https://go-review.googlesource.com/#/c/91570
      60             :    * @return list of MailComments parsed from the html part of the email
      61             :    */
      62             :   public static List<MailComment> parse(
      63             :       MailMessage email, Collection<HumanComment> comments, String changeUrl) {
      64             :     // TODO(hiesel) Add support for Gmail Mobile
      65             :     // TODO(hiesel) Add tests for other popular email clients
      66             : 
       67             :     // This parser goes through all html elements in the email and checks for
      68             :     // matching patterns. It keeps track of the last file and comments it
      69             :     // encountered to know in which context a parsed comment belongs.
      70             :     // It uses the href attributes of <a> tags to identify comments sent out by
       71             :     // Gerrit as these are generally more reliable than the text captions.
      72           1 :     List<MailComment> parsedComments = new ArrayList<>();
      73           1 :     Document d = Jsoup.parse(email.htmlContent());
      74           1 :     PeekingIterator<HumanComment> iter = Iterators.peekingIterator(comments.iterator());
      75             : 
      76           1 :     String lastEncounteredFileName = null;
      77           1 :     HumanComment lastEncounteredComment = null;
      78           1 :     for (Element e : d.body().getAllElements()) {
      79           1 :       String elementName = e.tagName();
      80           1 :       boolean isInBlockQuote =
      81           1 :           e.parents().stream()
      82           1 :               .anyMatch(
      83             :                   p ->
      84           1 :                       p.tagName().equals("blockquote")
      85           1 :                           || MAIL_PROVIDER_EXTRAS.contains(p.className()));
      86             : 
      87           1 :       if (elementName.equals("a")) {
      88           1 :         String href = e.attr("href");
      89             :         // Check if there is still a next comment that could be contained in
      90             :         // this <a> tag
      91           1 :         if (!iter.hasNext()) {
      92           1 :           continue;
      93             :         }
      94           1 :         HumanComment perspectiveComment = iter.peek();
      95           1 :         if (href.equals(ParserUtil.filePath(changeUrl, perspectiveComment))) {
      96           1 :           if (lastEncounteredFileName == null
      97           1 :               || !lastEncounteredFileName.equals(perspectiveComment.key.filename)) {
      98             :             // Not a file-level comment, but users could have typed a comment
      99             :             // right after this file annotation to create a new file-level
     100             :             // comment. If this file has a file-level comment, we have already
     101             :             // set lastEncounteredComment to that file-level comment when we
     102             :             // encountered the file link and should not reset it now.
     103           1 :             lastEncounteredFileName = perspectiveComment.key.filename;
     104           1 :             lastEncounteredComment = null;
     105           1 :           } else if (perspectiveComment.lineNbr == 0) {
     106             :             // This was originally a file-level comment
     107           1 :             lastEncounteredComment = perspectiveComment;
     108           1 :             iter.next();
     109             :           }
     110             :           continue;
     111           1 :         } else if (ParserUtil.isCommentUrl(href, changeUrl, perspectiveComment)) {
     112             :           // This is a regular inline comment
     113           1 :           lastEncounteredComment = perspectiveComment;
     114           1 :           iter.next();
     115           1 :           continue;
     116             :         }
     117             :       }
     118             : 
     119           1 :       if (isInBlockQuote) {
     120             :         // There is no user-input in quoted text
     121           1 :         continue;
     122             :       }
     123           1 :       if (!ALLOWED_HTML_TAGS.contains(elementName)) {
     124             :         // We only accept a set of allowed tags that can contain user input
     125           1 :         continue;
     126             :       }
     127           1 :       if (elementName.equals("a") && e.attr("href").startsWith("mailto:")) {
     128             :         // We don't accept mailto: links in general as they often appear in reply-to lines
     129             :         // (User<user@gmail.com> wrote: ...)
     130           1 :         continue;
     131             :       }
     132             : 
     133             :       // This is a comment typed by the user
     134             :       // Replace non-breaking spaces and trim string
     135           1 :       String content = e.ownText().replace('\u00a0', ' ').trim();
     136           1 :       boolean isLink = elementName.equals("a");
     137           1 :       if (!Strings.isNullOrEmpty(content)) {
     138           1 :         if (lastEncounteredComment == null && lastEncounteredFileName == null) {
     139             :           // Remove quotation line, email signature and
     140             :           // "Sent from my xyz device"
     141           1 :           content = ParserUtil.trimQuotation(content);
     142             :           // TODO(hiesel) Add more sanitizer
     143           1 :           if (!Strings.isNullOrEmpty(content)) {
     144           1 :             ParserUtil.appendOrAddNewComment(
     145             :                 new MailComment(
     146             :                     content, null, null, MailComment.CommentType.PATCHSET_LEVEL, isLink),
     147             :                 parsedComments);
     148             :           }
     149           1 :         } else if (lastEncounteredComment == null) {
     150           1 :           ParserUtil.appendOrAddNewComment(
     151             :               new MailComment(
     152             :                   content,
     153             :                   lastEncounteredFileName,
     154             :                   null,
     155             :                   MailComment.CommentType.FILE_COMMENT,
     156             :                   isLink),
     157             :               parsedComments);
     158             :         } else {
     159           1 :           ParserUtil.appendOrAddNewComment(
     160             :               new MailComment(
     161             :                   content,
     162             :                   null,
     163             :                   lastEncounteredComment,
     164             :                   MailComment.CommentType.INLINE_COMMENT,
     165             :                   isLink),
     166             :               parsedComments);
     167             :         }
     168             :       }
     169           1 :     }
     170           1 :     return parsedComments;
     171             :   }
     172             : }

Generated by: LCOV version 1.16+git.20220603.dfeb750