clang-tools-extra/clangd/index/dex/Trigram.cpp

   1 //===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "Trigram.h"
  10 #include "FuzzyMatch.h"
  11 #include "index/dex/Token.h"
  12 #include "llvm/ADT/ArrayRef.h"
  13 #include "llvm/ADT/DenseSet.h"
  14 #include "llvm/ADT/STLExtras.h"
  15 #include "llvm/ADT/SmallVector.h"
  16 #include "llvm/ADT/StringExtras.h"
  17 #include "llvm/ADT/StringRef.h"
  18 #include <cctype>
  19 #include <limits>
  20 #include <queue>
  21 #include <string>
  22 #include <utility>
  23
  24 namespace clang {
  25 namespace clangd {
  26 namespace dex {
  27
  28 // Produce trigrams (including duplicates) and pass them to Out().
  29 template <typename Func>
  30 static void identifierTrigrams(llvm::StringRef Identifier, Func Out) {
  31   assert(!Identifier.empty());
  32   // Apply fuzzy matching text segmentation.
  33   llvm::SmallVector<CharRole> Roles(Identifier.size());
  34   calculateRoles(Identifier,
  35                  llvm::MutableArrayRef(Roles.data(), Identifier.size()));
  36
  37   std::string LowercaseIdentifier = Identifier.lower();
  38
  39   // For each character, store indices of the characters to which fuzzy matching
  40   // algorithm can jump. There are 2 possible variants:
  41   //
  42   // * Next Tail - next character from the same segment
  43   // * Next Head - front character of the next segment
  44   //
  45   // Next stores tuples of three indices in the presented order, if a variant is
  46   // not available then 0 is stored.
  47   llvm::SmallVector<std::array<unsigned, 2>, 12> Next(
  48       LowercaseIdentifier.size());
  49   unsigned NextTail = 0, NextHead = 0;
  50   for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
  51     Next[I] = {{NextTail, NextHead}};
  52     NextTail = Roles[I] == Tail ? I : 0;
  53     if (Roles[I] == Head) {
  54       NextHead = I;
  55     }
  56   }
  57
  58   // Iterate through valid sequences of three characters Fuzzy Matcher can
  59   // process.
  60   for (unsigned I = 0; I < LowercaseIdentifier.size(); ++I) {
  61     // Skip delimiters.
  62     if (Roles[I] != Head && Roles[I] != Tail)
  63       continue;
  64     for (unsigned J : Next[I]) {
  65       if (J == 0)
  66         continue;
  67       for (unsigned K : Next[J]) {
  68         if (K == 0)
  69           continue;
  70         Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J],
  71                     LowercaseIdentifier[K]));
  72       }
  73     }
  74   }
  75   // Short queries semantics are different. When the user dosn't type enough
  76   // symbols to form trigrams, we still want to serve meaningful results. To
  77   // achieve that, we form incomplete trigrams (bi- and unigrams) for the
  78   // identifiers and also generate short trigrams on the query side from what
  79   // is available. We allow a small number of short trigram types in order to
  80   // prevent excessive memory usage and increase the quality of the search.
  81   // Only the first few symbols are allowed to be used in incomplete trigrams.
  82   //
  83   // Example - for "_abc_def_ghi_jkl" we'll get following incomplete trigrams:
  84   // "_", "_a", "a", "ab", "ad", "d", "de", "dg"
  85   for (unsigned Position = 0, HeadsSeen = 0; HeadsSeen < 2;) {
  86     // The first symbol might be a separator, so the loop condition should be
  87     // stopping as soon as there is no next head or we have seen two heads.
  88     if (Roles[Position] == Head)
  89       ++HeadsSeen;
  90     Out(Trigram(LowercaseIdentifier[Position]));
  91     for (unsigned I : Next[Position])
  92       if (I != 0)
  93         Out(Trigram(LowercaseIdentifier[Position], LowercaseIdentifier[I]));
  94     Position = Next[Position][1];
  95     if (Position == 0)
  96       break;
  97   }
  98 }
  99
 100 void generateIdentifierTrigrams(llvm::StringRef Identifier,
 101                                 std::vector<Trigram> &Result) {
 102   // Empirically, scanning for duplicates is faster with fewer trigrams, and
 103   // a hashtable is faster with more. This is a hot path, so dispatch based on
 104   // expected number of trigrams. Longer identifiers produce more trigrams.
 105   // The magic number was tuned by running IndexBenchmark.DexBuild.
 106   constexpr unsigned ManyTrigramsIdentifierThreshold = 14;
 107   Result.clear();
 108   if (Identifier.empty())
 109     return;
 110
 111   if (Identifier.size() < ManyTrigramsIdentifierThreshold) {
 112     identifierTrigrams(Identifier, [&](Trigram T) {
 113       if (!llvm::is_contained(Result, T))
 114         Result.push_back(T);
 115     });
 116   } else {
 117     identifierTrigrams(Identifier, [&](Trigram T) { Result.push_back(T); });
 118     llvm::sort(Result);
 119     Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
 120   }
 121 }
 122
 123 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
 124   if (Query.empty())
 125     return {};
 126
 127   // Apply fuzzy matching text segmentation.
 128   llvm::SmallVector<CharRole> Roles(Query.size());
 129   calculateRoles(Query, llvm::MutableArrayRef(Roles.data(), Query.size()));
 130
 131   std::string LowercaseQuery = Query.lower();
 132
 133   llvm::DenseSet<Token> UniqueTrigrams;
 134   std::string Chars;
 135   for (unsigned I = 0; I < LowercaseQuery.size(); ++I) {
 136     if (Roles[I] != Head && Roles[I] != Tail)
 137       continue; // Skip delimiters.
 138     Chars.push_back(LowercaseQuery[I]);
 139     if (Chars.size() > 3)
 140       Chars.erase(Chars.begin());
 141     if (Chars.size() == 3)
 142       UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
 143   }
 144
 145   // For queries with very few letters, generateIdentifierTrigrams emulates
 146   // outputs of this function to match the semantics.
 147   if (UniqueTrigrams.empty()) {
 148     // If bigram can't be formed out of letters/numbers, we prepend separator.
 149     std::string Result(1, LowercaseQuery.front());
 150     for (unsigned I = 1; I < LowercaseQuery.size(); ++I)
 151       if (Roles[I] == Head || Roles[I] == Tail)
 152         Result += LowercaseQuery[I];
 153     UniqueTrigrams.insert(
 154         Token(Token::Kind::Trigram, llvm::StringRef(Result).take_back(2)));
 155   }
 156
 157   return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
 158 }
 159
 160 } // namespace dex
 161 } // namespace clangd
 162 } // namespace clang