Line data Source code
1 : // Copyright (C) 2013 The Android Open Source Project 2 : // 3 : // Licensed under the Apache License, Version 2.0 (the "License"); 4 : // you may not use this file except in compliance with the License. 5 : // You may obtain a copy of the License at 6 : // 7 : // http://www.apache.org/licenses/LICENSE-2.0 8 : // 9 : // Unless required by applicable law or agreed to in writing, software 10 : // distributed under the License is distributed on an "AS IS" BASIS, 11 : // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 : // See the License for the specific language governing permissions and 13 : // limitations under the License. 14 : 15 : package com.google.gerrit.index.query; 16 : 17 : import static com.google.common.collect.ImmutableSet.toImmutableSet; 18 : 19 : import com.google.common.base.CharMatcher; 20 : import com.google.common.base.Splitter; 21 : import com.google.common.collect.ImmutableSet; 22 : import com.google.common.primitives.Ints; 23 : import com.google.common.primitives.Longs; 24 : import com.google.gerrit.index.FieldType; 25 : import com.google.gerrit.index.SchemaFieldDefs.SchemaField; 26 : import java.util.Objects; 27 : import java.util.Set; 28 : import java.util.stream.StreamSupport; 29 : 30 : /** Predicate that is mapped to a field in the index. */ 31 : public abstract class IndexPredicate<I> extends OperatorPredicate<I> implements Matchable<I> { 32 : /** 33 : * Text segmentation to be applied to both the query string and the indexed field for full-text 34 : * queries. This is inspired by http://unicode.org/reports/tr29/ which is what Lucene uses, but 35 : * complexity was reduced to the bare minimum at the cost of small discrepancies to the Unicode 36 : * spec. 37 : */ 38 152 : private static final Splitter FULL_TEXT_SPLITTER = Splitter.on(CharMatcher.anyOf(" ,.-:\\/_=\n")); 39 : 40 : private final SchemaField<I, ?> def; 41 : 42 : protected IndexPredicate(SchemaField<I, ?> def, String value) { 43 152 : super(def.getName(), value); 44 152 : this.def = def; 45 152 : } 46 : 47 : protected IndexPredicate(SchemaField<I, ?> def, String name, String value) { 48 80 : super(name, value); 49 80 : this.def = def; 50 80 : } 51 : 52 : public SchemaField<I, ?> getField() { 53 151 : return def; 54 : } 55 : 56 : public FieldType<?> getType() { 57 7 : return def.getType(); 58 : } 59 : 60 : /** 61 : * This method matches documents without calling an index subsystem. For primitive fields (e.g. 62 : * integer, long) , the matching logic is consistent across this method and all known index 63 : * implementations. For text fields (i.e. prefix and full-text) the semantics vary between this 64 : * implementation and known index implementations: 65 : * <li>Prefix: Lucene as well as {@link #match(Object)} matches terms as true prefixes (prefix:foo 66 : * -> `foo bar` matches, but `baz foo bar` does not match). The index implementation at Google 67 : * tokenizes both the query and the indexed text and matches tokens individually (prefix:fo ba 68 : * -> `baz foo bar` matches). 69 : * <li>Full text: Lucene uses a {@code PhraseQuery} to search for terms in full text fields 70 : * in-order. The index implementation at Google as well as {@link #match(Object)} tokenizes 71 : * both the query and the indexed text and matches tokens individually. 72 : * 73 : * @return true if the predicate matches the provided {@code I}. 74 : */ 75 : @Override 76 : public boolean match(I doc) { 77 150 : if (getField().isRepeatable()) { 78 110 : Iterable<?> values = (Iterable<?>) getField().get(doc); 79 110 : for (Object v : values) { 80 91 : if (matchesSingleObject(v)) { 81 79 : return true; 82 : } 83 82 : } 84 109 : return false; 85 : } 86 149 : return matchesSingleObject(getField().get(doc)); 87 : } 88 : 89 : @Override 90 : public int getCost() { 91 121 : return 1; 92 : } 93 : 94 : private boolean matchesSingleObject(Object fieldValueFromObject) { 95 150 : String fieldTypeName = getField().getType().getName(); 96 150 : if (fieldTypeName.equals(FieldType.INTEGER.getName())) { 97 74 : return Objects.equals(fieldValueFromObject, Ints.tryParse(value)); 98 150 : } else if (fieldTypeName.equals(FieldType.EXACT.getName())) { 99 150 : return Objects.equals(fieldValueFromObject, value); 100 101 : } else if (fieldTypeName.equals(FieldType.LONG.getName())) { 101 0 : return Objects.equals(fieldValueFromObject, Longs.tryParse(value)); 102 101 : } else if (fieldTypeName.equals(FieldType.PREFIX.getName())) { 103 101 : return String.valueOf(fieldValueFromObject).startsWith(value); 104 14 : } else if (fieldTypeName.equals(FieldType.FULL_TEXT.getName())) { 105 14 : Set<String> tokenizedField = tokenizeString(String.valueOf(fieldValueFromObject)); 106 14 : Set<String> tokenizedValue = tokenizeString(value); 107 14 : return !tokenizedValue.isEmpty() && tokenizedField.containsAll(tokenizedValue); 108 0 : } else if (fieldTypeName.equals(FieldType.STORED_ONLY.getName())) { 109 0 : throw new IllegalStateException("can't filter for storedOnly field " + getField().getName()); 110 0 : } else if (fieldTypeName.equals(FieldType.TIMESTAMP.getName())) { 111 0 : throw new IllegalStateException("timestamp queries must be handled in subclasses"); 112 0 : } else if (fieldTypeName.equals(FieldType.INTEGER_RANGE.getName())) { 113 0 : throw new IllegalStateException("integer range queries must be handled in subclasses"); 114 : } else { 115 0 : throw new IllegalStateException("unrecognized field " + fieldTypeName); 116 : } 117 : } 118 : 119 : private static ImmutableSet<String> tokenizeString(String value) { 120 14 : return StreamSupport.stream(FULL_TEXT_SPLITTER.split(value.toLowerCase()).spliterator(), false) 121 14 : .filter(s -> !s.trim().isEmpty()) 122 14 : .collect(toImmutableSet()); 123 : } 124 : }