Line data Source code
// Copyright (C) 2016 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.gerrit.mail;

import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;
import com.google.gerrit.entities.HumanComment;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/** Provides functionality for parsing the HTML part of a {@link MailMessage}. */
public class HtmlParser {

  /** CSS classes that mail providers put on containers holding quoted (non-user) content. */
  private static final ImmutableSet<String> MAIL_PROVIDER_EXTRAS =
      ImmutableSet.of(
          "gmail_extra", // "On 01/01/2017 User<user@gmail.com> wrote:"
          "gmail_quote" // Used for quoting original content
          );

  /** Tags whose own text is accepted as user input. */
  private static final ImmutableSet<String> ALLOWED_HTML_TAGS =
      ImmutableSet.of(
          "div", // Most user-typed comments are contained in a <div> tag
          "a", // We allow links to be contained in a comment
          "font" // Some email clients like nesting input in a new font tag
          );

  private HtmlParser() {}

  /**
   * Parses comments from html email.
   *
   * <p>This parser goes through all html elements in the email and checks for matching patterns.
   * It keeps track of the last file and comments it encountered to know in which context a parsed
   * comment belongs. It uses the href attributes of {@code <a>} tags to identify comments sent out
   * by Gerrit as these are generally more reliable than the text captions.
   *
   * @param email the message as received from the email service
   * @param comments a specific set of comments as sent out in the original notification email.
   *     Comments are expected to be in the same order as they were sent out to in the email.
   * @param changeUrl canonical change URL that points to the change on this Gerrit instance.
   *     Example: https://go-review.googlesource.com/#/c/91570
   * @return list of MailComments parsed from the html part of the email
   */
  public static List<MailComment> parse(
      MailMessage email, Collection<HumanComment> comments, String changeUrl) {
    // TODO(hiesel) Add support for Gmail Mobile
    // TODO(hiesel) Add tests for other popular email clients

    List<MailComment> parsedComments = new ArrayList<>();
    Document d = Jsoup.parse(email.htmlContent());
    PeekingIterator<HumanComment> iter = Iterators.peekingIterator(comments.iterator());

    String lastEncounteredFileName = null;
    HumanComment lastEncounteredComment = null;
    for (Element e : d.body().getAllElements()) {
      String elementName = e.tagName();
      boolean isLink = elementName.equals("a");

      if (isLink) {
        String href = e.attr("href");
        // Check if there is still a next comment that could be contained in
        // this <a> tag
        if (!iter.hasNext()) {
          continue;
        }
        HumanComment perspectiveComment = iter.peek();
        if (href.equals(ParserUtil.filePath(changeUrl, perspectiveComment))) {
          if (lastEncounteredFileName == null
              || !lastEncounteredFileName.equals(perspectiveComment.key.filename)) {
            // Not a file-level comment, but users could have typed a comment
            // right after this file annotation to create a new file-level
            // comment. If this file has a file-level comment, we have already
            // set lastEncounteredComment to that file-level comment when we
            // encountered the file link and should not reset it now.
            lastEncounteredFileName = perspectiveComment.key.filename;
            lastEncounteredComment = null;
          } else if (perspectiveComment.lineNbr == 0) {
            // This was originally a file-level comment
            lastEncounteredComment = perspectiveComment;
            iter.next();
          }
          continue;
        } else if (ParserUtil.isCommentUrl(href, changeUrl, perspectiveComment)) {
          // This is a regular inline comment
          lastEncounteredComment = perspectiveComment;
          iter.next();
          continue;
        }
      }

      if (!ALLOWED_HTML_TAGS.contains(elementName)) {
        // We only accept a set of allowed tags that can contain user input
        continue;
      }
      if (isLink && e.attr("href").startsWith("mailto:")) {
        // We don't accept mailto: links in general as they often appear in reply-to lines
        // (User<user@gmail.com> wrote: ...)
        continue;
      }
      // The ancestor walk is side-effect free, so it is only done for elements that passed the
      // cheaper checks above.
      if (isInQuotedContent(e)) {
        // There is no user-input in quoted text
        continue;
      }

      // This is a comment typed by the user
      // Replace non-breaking spaces and trim string
      String content = e.ownText().replace('\u00a0', ' ').trim();
      if (Strings.isNullOrEmpty(content)) {
        continue;
      }

      if (lastEncounteredComment == null && lastEncounteredFileName == null) {
        // Remove quotation line, email signature and
        // "Sent from my xyz device"
        content = ParserUtil.trimQuotation(content);
        // TODO(hiesel) Add more sanitizer
        if (!Strings.isNullOrEmpty(content)) {
          ParserUtil.appendOrAddNewComment(
              new MailComment(content, null, null, MailComment.CommentType.PATCHSET_LEVEL, isLink),
              parsedComments);
        }
      } else if (lastEncounteredComment == null) {
        ParserUtil.appendOrAddNewComment(
            new MailComment(
                content,
                lastEncounteredFileName,
                null,
                MailComment.CommentType.FILE_COMMENT,
                isLink),
            parsedComments);
      } else {
        ParserUtil.appendOrAddNewComment(
            new MailComment(
                content,
                null,
                lastEncounteredComment,
                MailComment.CommentType.INLINE_COMMENT,
                isLink),
            parsedComments);
      }
    }
    return parsedComments;
  }

  /**
   * Returns whether the element is nested inside quoted content.
   *
   * <p>An element counts as quoted when any of its ancestors is a {@code <blockquote>} or carries
   * one of the {@link #MAIL_PROVIDER_EXTRAS} CSS classes. Each class of an ancestor is checked on
   * its own, so a container is still recognized when the {@code class} attribute lists further
   * classes next to the provider marker.
   *
   * @param element the element whose ancestors are inspected
   * @return {@code true} if any ancestor marks quoted content
   */
  private static boolean isInQuotedContent(Element element) {
    for (Element parent : element.parents()) {
      if (parent.tagName().equals("blockquote")) {
        return true;
      }
      for (String cssClass : parent.classNames()) {
        if (MAIL_PROVIDER_EXTRAS.contains(cssClass)) {
          return true;
        }
      }
    }
    return false;
  }
}