File size: 6,441 Bytes

d552aac

// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

include "expression.fbs";
include "buffer.fbs";
include "language-tag.fbs";

// The terminal rules map, stored as a sorted strings table.
// The sorted terminal strings are represented as offsets into the global
// strings pool, which allows localized rule sets to share strings and thereby
// save memory.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
table TerminalRulesMap {
  // The offsets into the terminals pool.
  terminal_offsets:[uint];

  // The lhs set associated with a terminal rule.
  // Each entry is an offset into the (deduplicated) global `lhs_set` vector.
  // NOTE(review): presumably parallel to `terminal_offsets` (entry i belongs
  // to the terminal at `terminal_offsets[i]`) -- confirm against the builder.
  lhs_set_index:[uint];

  // Lower and upper bounds on the lengths of the terminal strings, used to
  // abort a lookup early.
  min_terminal_length:int;

  max_terminal_length:int;
}

// One key, value pair entry in the unary rules map.
// The map is documented at `Rules.unary_rules` as going from a nonterminal to
// an lhs set index into the (deduplicated) global `lhs_set` vector.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
struct UnaryRulesEntry {
  // The nonterminal. The `(key)` attribute marks this field as the key the
  // containing vector is sorted and looked up by.
  key:uint (key);
  // The lhs set index associated with the nonterminal.
  value:uint;
}

// One key, value pair entry in the binary rules hash map.
// The key is the pair of rhs nonterminals and the value is the index of the
// lhs set.
// Declared as a struct: the fields are stored inline in declaration order.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
struct BinaryRule {
  // The two rhs nonterminals.
  rhs_first:uint;

  rhs_second:uint;

  // The lhs set associated with this binary rule.
  // This is an offset into the (deduplicated) global `lhs_set` vector.
  lhs_set_index:uint;
}

// One bucket in the binary rule hash map; it holds all entries that share a
// given hash value.
namespace libtextclassifier3.grammar.RulesSet_.Rules_;
table BinaryRuleTableBucket {
  // The binary rules that fall into this bucket.
  rules:[BinaryRule];
}

// The terminal, unary and binary rules that apply to a set of locales.
namespace libtextclassifier3.grammar.RulesSet_;
table Rules {
  // The locales this rule set applies to.
  locale:[LanguageTag];

  // The terminal rules maps.
  // NOTE(review): presumably `terminal_rules` is matched against the text as
  // is and `lowercase_terminal_rules` against lowercased text -- confirm
  // against the matcher.
  terminal_rules:Rules_.TerminalRulesMap;
  lowercase_terminal_rules:Rules_.TerminalRulesMap;

  // The unary rules map.
  // This is a map from a nonterminal to an lhs set index into the
  // (deduplicated) global `lhs_set` vector.
  unary_rules:[Rules_.UnaryRulesEntry];

  // The binary rules (hash) map.
  // This is a map from a nonterminal pair to an lhs set index into the
  // (deduplicated) global `lhs_set` vector.
  binary_rules:[Rules_.BinaryRuleTableBucket];
}

// A set of lhs nonterminals associated with a rule match.
// Most commonly, an entry is just the id of the lhs nonterminal of the rule
// that is triggered; in this case the entry of `lhs` is set to the id of the
// nonterminal.
// If a callback needs to be triggered, the entry is the (negated) index into
// the `RulesSet.lhs` vector, whose element specifies, in addition to the
// nonterminal, also the callback and the parameter to call it with.
namespace libtextclassifier3.grammar.RulesSet_;
table LhsSet {
  lhs:[int];
}

// A lhs nonterminal together with the callback to trigger on a match.
// Elements of the `RulesSet.lhs` vector; referenced from `LhsSet.lhs` by
// (negated) index.
namespace libtextclassifier3.grammar.RulesSet_;
struct Lhs {
  // The lhs nonterminal.
  nonterminal:uint;

  // The id of the callback to trigger.
  callback_id:uint;

  // A parameter to pass when invoking the callback.
  callback_param:ulong;

  // The maximum amount of whitespace allowed between the two parts.
  // A value of -1 allows for unbounded whitespace.
  // NOTE(review): "the two parts" presumably refers to the two rhs
  // nonterminals of a binary rule -- confirm.
  max_whitespace_gap:byte;
}

// One key, value pair entry in the `Nonterminals.annotation_nt` map, which
// maps annotation/collection names to nonterminal ids.
namespace libtextclassifier3.grammar.RulesSet_.Nonterminals_;
table AnnotationNtEntry {
  // The annotation/collection name. The `(key)` attribute marks this field as
  // the key the containing vector is sorted and looked up by.
  key:string (key);
  // The id of the nonterminal for that annotation/collection.
  value:int;
}

// Ids of the pre-defined nonterminals that the lexer can generate if they are
// used by the grammar.
namespace libtextclassifier3.grammar.RulesSet_;
table Nonterminals {
  // Id of the nonterminal indicating the start of input.
  start_nt:int;

  // Id of the nonterminal indicating the end of input.
  end_nt:int;

  // Id of the nonterminal indicating a token.
  token_nt:int;

  // Id of the nonterminal indicating a string of digits.
  digits_nt:int;

  // `n_digits_nt[k]` is the id of the nonterminal indicating a string of
  // `k` digits.
  n_digits_nt:[int];

  // Id of the nonterminal indicating a word or token boundary.
  wordbreak_nt:int;

  // Id of the nonterminal indicating an uppercase token.
  uppercase_token_nt:int;

  // Predefined nonterminals for annotations.
  // Maps annotation/collection names to nonterminal ids.
  annotation_nt:[Nonterminals_.AnnotationNtEntry];
}

// One key, value pair entry in the `DebugInformation.nonterminal_names` map.
namespace libtextclassifier3.grammar.RulesSet_.DebugInformation_;
table NonterminalNamesEntry {
  // NOTE(review): presumably the nonterminal id -- confirm. The `(key)`
  // attribute marks this field as the key the containing vector is sorted and
  // looked up by.
  key:int (key);
  // NOTE(review): presumably the human-readable name of that nonterminal.
  value:string;
}

// Debug information, e.g. for printing parse trees and showing match
// information.
namespace libtextclassifier3.grammar.RulesSet_;
table DebugInformation {
  // Keyed entries with an int key and a string value each.
  // NOTE(review): presumably maps nonterminal ids to their names -- confirm.
  nonterminal_names:[DebugInformation_.NonterminalNamesEntry];
}

// A regex annotator: a pattern together with the nonterminal it triggers.
namespace libtextclassifier3.grammar.RulesSet_;
table RegexAnnotator {
  // The pattern to run.
  pattern:string;

  // NOTE(review): presumably a compressed representation of the pattern, used
  // as an alternative to `pattern` -- confirm which one takes precedence.
  compressed_pattern:CompressedBuffer;

  // The nonterminal to trigger.
  nonterminal:uint;
}

// Context free grammar rules representation.
// Rules are represented in (mostly) Chomsky Normal Form, where all rules are
// of one of the following forms:
// * <nonterm> ::= term
// * <nonterm> ::= <nonterm>
// * <nonterm> ::= <nonterm> <nonterm>
// The `terminal_rules`, `unary_rules` and `binary_rules` maps in
// `RulesSet_.Rules` represent these sets of rules.
namespace libtextclassifier3.grammar;
table RulesSet {
  // The rules, one entry per set of locales.
  rules:[RulesSet_.Rules];

  // The (deduplicated) global lhs sets that the `lhs_set_index` fields and the
  // unary rules map values index into.
  lhs_set:[RulesSet_.LhsSet];

  // The lhs entries with callback information, referenced by (negated) index
  // from `LhsSet.lhs`.
  lhs:[RulesSet_.Lhs];

  // Terminals string pool.
  // The strings are zero-byte delimited and offset indexed by
  // `terminal_offsets` in the terminals rules map.
  terminals:string;

  // The pre-defined nonterminals used by the grammar.
  nonterminals:RulesSet_.Nonterminals;

  // Deprecated field, occupying the sixth field slot of this table. It stays
  // in the schema so that the fields declared after it keep their slots.
  reserved_6:int16 (deprecated);

  debug_information:RulesSet_.DebugInformation;

  // The regex annotators.
  regex_annotator:[RulesSet_.RegexAnnotator];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool;

  // The semantic expressions associated with rule matches.
  semantic_expression:[SemanticExpression];

  // The schema defining the semantic results.
  // NOTE(review): stored as raw bytes; presumably a serialized schema that is
  // parsed at runtime -- confirm the expected format.
  semantic_values_schema:[ubyte];
}