Line data Source code
1 : // Copyright (C) 2020 The Android Open Source Project 2 : // 3 : // Licensed under the Apache License, Version 2.0 (the "License"); 4 : // you may not use this file except in compliance with the License. 5 : // You may obtain a copy of the License at 6 : // 7 : // http://www.apache.org/licenses/LICENSE-2.0 8 : // 9 : // Unless required by applicable law or agreed to in writing, software 10 : // distributed under the License is distributed on an "AS IS" BASIS, 11 : // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 : // See the License for the specific language governing permissions and 13 : // limitations under the License. 14 : 15 : package com.google.gerrit.server.patch.gitfilediff; 16 : 17 : import static java.nio.charset.StandardCharsets.UTF_8; 18 : 19 : import com.google.common.collect.ImmutableList; 20 : import com.google.gerrit.entities.Patch; 21 : import com.google.gerrit.entities.Patch.PatchType; 22 : import java.util.Optional; 23 : import org.eclipse.jgit.patch.CombinedFileHeader; 24 : import org.eclipse.jgit.patch.FileHeader; 25 : import org.eclipse.jgit.util.IntList; 26 : import org.eclipse.jgit.util.RawParseUtils; 27 : 28 : /** A utility class for the {@link FileHeader} JGit object */ 29 0 : public class FileHeaderUtil { 30 104 : private static final Byte NUL = '\0'; 31 : 32 : /** 33 : * The maximum number of characters to lookup in the binary file {@link FileHeader}. This is used 34 : * to scan the file header for the occurrence of the {@link #NUL} character. 35 : * 36 : * <p>This limit assumes a uniform distribution of all characters, hence the probability of the 37 : * occurrence of each character = (1 / 256). We want to find the limit that makes the prob. of 38 : * finding {@link #NUL} > 0.999. 1 - (255 / 256) ^ N > 0.999 yields N = 1766. We set the limit to 39 : * this value multiplied by 10 for more confidence. 40 : */ 41 : private static final int BIN_FILE_MAX_SCAN_LIMIT = 20000; 42 : 43 : /** Converts the {@link FileHeader} parameter to a String representation. */ 44 : static String toString(FileHeader header) { 45 104 : return new String(FileHeaderUtil.toByteArray(header), UTF_8); 46 : } 47 : 48 : /** Converts the {@link FileHeader} parameter to a byte array. */ 49 : static byte[] toByteArray(FileHeader header) { 50 104 : int end = getEndOffset(header); 51 104 : if (header.getStartOffset() == 0 && end == header.getBuffer().length) { 52 104 : return header.getBuffer(); 53 : } 54 : 55 0 : final byte[] buf = new byte[end - header.getStartOffset()]; 56 0 : System.arraycopy(header.getBuffer(), header.getStartOffset(), buf, 0, buf.length); 57 0 : return buf; 58 : } 59 : 60 : /** Splits the {@code FileHeader} string to a list of strings, one string per header line. */ 61 : public static ImmutableList<String> getHeaderLines(FileHeader fileHeader) { 62 104 : String fileHeaderString = toString(fileHeader); 63 104 : return getHeaderLines(fileHeaderString); 64 : } 65 : 66 : public static ImmutableList<String> getHeaderLines(String header) { 67 104 : return getHeaderLines(header.getBytes(UTF_8)); 68 : } 69 : 70 : static ImmutableList<String> getHeaderLines(byte[] header) { 71 104 : final IntList lineStartOffsets = RawParseUtils.lineMap(header, 0, header.length); 72 104 : final ImmutableList.Builder<String> headerLines = 73 104 : ImmutableList.builderWithExpectedSize(lineStartOffsets.size() - 1); 74 104 : for (int i = 1; i < lineStartOffsets.size() - 1; i++) { 75 104 : final int b = lineStartOffsets.get(i); 76 104 : int e = lineStartOffsets.get(i + 1); 77 104 : if (header[e - 1] == '\n') { 78 104 : e--; 79 : } 80 104 : headerLines.add(RawParseUtils.decode(UTF_8, header, b, e)); 81 : } 82 104 : return headerLines.build(); 83 : } 84 : 85 : /** 86 : * Returns the old file path associated with the {@link FileHeader}, or empty if the file is 87 : * {@link com.google.gerrit.entities.Patch.ChangeType#ADDED} or {@link 88 : * com.google.gerrit.entities.Patch.ChangeType#REWRITE}. 89 : */ 90 : public static Optional<String> getOldPath(FileHeader header) { 91 104 : Patch.ChangeType changeType = getChangeType(header); 92 104 : switch (changeType) { 93 : case DELETED: 94 : case COPIED: 95 : case RENAMED: 96 : case MODIFIED: 97 54 : return Optional.of(header.getOldPath()); 98 : 99 : case ADDED: 100 : case REWRITE: 101 104 : return Optional.empty(); 102 : } 103 0 : return Optional.empty(); 104 : } 105 : 106 : /** 107 : * Returns the new file path associated with the {@link FileHeader}, or empty if the file is 108 : * {@link com.google.gerrit.entities.Patch.ChangeType#DELETED}. 109 : */ 110 : public static Optional<String> getNewPath(FileHeader header) { 111 104 : Patch.ChangeType changeType = getChangeType(header); 112 104 : switch (changeType) { 113 : case DELETED: 114 23 : return Optional.empty(); 115 : 116 : case ADDED: 117 : case MODIFIED: 118 : case REWRITE: 119 : case COPIED: 120 : case RENAMED: 121 104 : return Optional.of(header.getNewPath()); 122 : } 123 0 : return Optional.empty(); 124 : } 125 : 126 : /** Returns the change type associated with the file header. */ 127 : public static Patch.ChangeType getChangeType(FileHeader header) { 128 : // In Gerrit, we define our own entities of the JGit entities, so that we have full control 129 : // over their behaviors (e.g. making sure that these entities are immutable so that we can add 130 : // them as fields of keys / values of persisted caches). 131 : 132 : // TODO(ghareeb): remove the dead code of the value REWRITE and all its handling 133 104 : switch (header.getChangeType()) { 134 : case ADD: 135 104 : return Patch.ChangeType.ADDED; 136 : case MODIFY: 137 53 : return Patch.ChangeType.MODIFIED; 138 : case DELETE: 139 23 : return Patch.ChangeType.DELETED; 140 : case RENAME: 141 12 : return Patch.ChangeType.RENAMED; 142 : case COPY: 143 4 : return Patch.ChangeType.COPIED; 144 : default: 145 0 : throw new IllegalArgumentException("Unsupported type " + header.getChangeType()); 146 : } 147 : } 148 : 149 : public static PatchType getPatchType(FileHeader header) { 150 : PatchType patchType; 151 : 152 104 : switch (header.getPatchType()) { 153 : case UNIFIED: 154 104 : patchType = Patch.PatchType.UNIFIED; 155 104 : break; 156 : case GIT_BINARY: 157 : case BINARY: 158 2 : patchType = Patch.PatchType.BINARY; 159 2 : break; 160 : default: 161 0 : throw new IllegalArgumentException("Unsupported type " + header.getPatchType()); 162 : } 163 : 164 104 : if (patchType != PatchType.BINARY) { 165 104 : byte[] buf = header.getBuffer(); 166 : // TODO(ghareeb): should we adjust the max limit threshold? 167 : // JGit sometimes misses the detection of binary files. In this case we look into the file 168 : // header for the occurrence of NUL characters, which is a definite signal that the file is 169 : // binary. We limit the number of characters to lookup to avoid performance bottlenecks. 170 104 : for (int ptr = header.getStartOffset(); 171 104 : ptr < Math.min(header.getEndOffset(), BIN_FILE_MAX_SCAN_LIMIT); 172 104 : ptr++) { 173 104 : if (buf[ptr] == NUL) { 174 : // It's really binary, but Git couldn't see the nul early enough to realize its binary, 175 : // and instead produced the diff. 176 : // 177 : // Force it to be a binary; it really should have been that. 178 0 : return PatchType.BINARY; 179 : } 180 : } 181 : } 182 104 : return patchType; 183 : } 184 : 185 : /** 186 : * Returns the end offset of the diff header line of the {@code FileHeader parameter} before the 187 : * appearance of any file edits (diff hunks). 188 : */ 189 : private static int getEndOffset(FileHeader fileHeader) { 190 104 : if (fileHeader instanceof CombinedFileHeader) { 191 0 : return fileHeader.getEndOffset(); 192 : } 193 104 : if (!fileHeader.getHunks().isEmpty()) { 194 104 : return fileHeader.getHunks().get(0).getStartOffset(); 195 : } 196 0 : return fileHeader.getEndOffset(); 197 : } 198 : }