ICU 66.0.1  66.0.1
stringtriebuilder.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010-2012,2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: stringtriebuilder.h
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2010dec24
14 * created by: Markus W. Scherer
15 */
16 
17 #ifndef __STRINGTRIEBUILDER_H__
18 #define __STRINGTRIEBUILDER_H__
19 
20 #include "unicode/utypes.h"
21 
22 #if U_SHOW_CPLUSPLUS_API
23 
24 #include "unicode/uobject.h"
25 
31 // Forward declaration.
// Within this header UHashtable is only named as a pointer type (the `nodes`
// member of StringTrieBuilder), so an incomplete type is sufficient here.
33 struct UHashtable;
34 typedef struct UHashtable UHashtable;
36 
58 };
59 
60 U_NAMESPACE_BEGIN
61 
68 class U_COMMON_API StringTrieBuilder : public UObject {
69 public:
70 #ifndef U_HIDE_INTERNAL_API
71 
// Static helpers that take untyped (const void *) node pointers.
// NOTE(review): presumably the hash and equality callbacks for the `nodes`
// hash set; the definitions are not in this header -- confirm in the .cpp file.
72  static int32_t hashNode(const void *node);
74  static UBool equalNodes(const void *left, const void *right);
75 #endif /* U_HIDE_INTERNAL_API */
76 
77 protected:
78  // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
79  // or else the compiler will create a public default constructor.
     // Both are declared under `protected:`; the class also has pure virtual
     // members, so it can only be instantiated through a concrete subclass.
81  StringTrieBuilder();
83  virtual ~StringTrieBuilder();
84 
85 #ifndef U_HIDE_INTERNAL_API
86 
// Builder lifecycle and serialization helpers available to subclasses.
// NOTE(review): all five are defined outside this header; the notes below are
// inferred from names and signatures only -- confirm against the implementation.
     // Presumably sets up the `nodes` hash set, sized from sizeGuess; failures go to errorCode.
87  void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
     // Presumably the counterpart that releases what createCompactBuilder() set up.
89  void deleteCompactBuilder();
90 
     // Entry point taking the build option and the number of elements; failures go to errorCode.
92  void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
93 
     // Write directly for the element range [start, limit) at unitIndex -- range semantics TODO confirm.
95  int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
97  int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
98 #endif /* U_HIDE_INTERNAL_API */
99 
// Forward declaration: the member functions declared below use Node * before
// the full definition of the nested Node class appears later in this class.
100  class Node;
101 
102 #ifndef U_HIDE_INTERNAL_API
103 
// Node-returning counterparts of writeNode()/writeBranchSubNode(): same
// start/limit/unitIndex(/length) parameters plus a UErrorCode, returning a Node *.
// NOTE(review): ownership of the returned node is not visible here; see the
// "C++ note" on registerNode()/registerFinalValue() further down and confirm.
104  Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
106  Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
107  int32_t length, UErrorCode &errorCode);
108 #endif /* U_HIDE_INTERNAL_API */
109 
// Pure virtual, const hooks that every concrete builder must implement.
// NOTE(review): i/first/last/start/limit are presumably indexes into the
// subclass's element list and unitIndex a position within an element's
// string; this is inferred from names only -- confirm in the concrete builders.
111  virtual int32_t getElementStringLength(int32_t i) const = 0;
113  virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const = 0;
115  virtual int32_t getElementValue(int32_t i) const = 0;
116 
117  // Finds the first unit index after this one where
118  // the first and last element have different units again.
120  virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
121 
122  // Number of different units at unitIndex.
124  virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
     // Presumably returns the element index reached after skipping `count` distinct units -- TODO confirm.
126  virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
     // Presumably returns the index of the first element whose unit at unitIndex differs from `unit` -- TODO confirm.
128  virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const = 0;
129 
     // Format capability flag supplied by the subclass.
131  virtual UBool matchNodesCanHaveValues() const = 0;
132 
     // Format-specific limits supplied by the subclass.
134  virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
136  virtual int32_t getMinLinearMatch() const = 0;
138  virtual int32_t getMaxLinearMatchLength() const = 0;
139 
140 #ifndef U_HIDE_INTERNAL_API
141  // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
     // Also the capacity of the fixed-size arrays inside ListBranchNode.
143  static const int32_t kMaxBranchLinearSubNodeLength=5;
144 
145  // Maximum number of nested split-branch levels for a branch on all 2^16 possible char16_t units.
146  // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
     // With kMaxBranchLinearSubNodeLength=5: log2(65536/5) is about 13.68, which rounds up to 14.
148  static const int32_t kMaxSplitBranchLevels=14;
149 
// Node registration. Per the "C++ note" comment that follows this #ifndef
// region, both functions take ownership of their input, return only owned
// nodes, and return NULL on failure.
160  Node *registerNode(Node *newNode, UErrorCode &errorCode);
171  Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
172 #endif /* U_HIDE_INTERNAL_API */
173 
174  /*
175  * C++ note:
176  * registerNode() and registerFinalValue() take ownership of their input nodes,
177  * and only return owned nodes.
178  * If they see a failure UErrorCode, they will delete the input node.
179  * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
180  * If there is a failure, they return NULL.
181  *
182  * NULL Node pointers can be safely passed into other Nodes because
183  * they call the static Node::hashCode() which checks for a NULL pointer first.
184  *
185  * Therefore, as long as builder functions register a new node,
186  * they need to check for failures only before explicitly dereferencing
187  * a Node pointer, or before setting a new UErrorCode.
188  */
189 
190  // Hash set of nodes, maps from nodes to integer 1.
     // Held as a raw pointer to the forward-declared UHashtable.
     // NOTE(review): presumably created by createCompactBuilder() and released
     // by deleteCompactBuilder() -- confirm in the implementation file.
192  UHashtable *nodes;
193 
194  // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
195  // it is needed for layout of other objects.
// Abstract base class of all builder nodes (write() is pure virtual).
// Each node carries a precomputed hash and an offset. Per the comments below,
// a negative offset is an edge number and a positive offset means the node
// has already been written; the constructor starts it at 0.
200  class Node : public UObject {
201  public:
     // initialHash: hash value computed by the derived class's constructor.
202  Node(int32_t initialHash) : hash(initialHash), offset(0) {}
     // Returns the stored hash; nothing is recomputed here.
203  inline int32_t hashCode() const { return hash; }
204  // Handles node==NULL.
     // Returns 0 for a NULL node, otherwise the node's stored hash.
205  static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
206  // Base class operator==() compares the actual class types.
207  virtual UBool operator==(const Node &other) const;
     // Negation of the virtual operator==(), so derived overrides are honored.
208  inline UBool operator!=(const Node &other) const { return !operator==(other); }
     // NOTE(review): defined outside this header. Judging by
     // writeUnlessInsideRightEdge() below, it presumably stores negative edge
     // numbers into offset -- confirm against the implementation.
236  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
237  // write() must set the offset to a positive value.
238  virtual void write(StringTrieBuilder &builder) = 0;
239  // See markRightEdgesFirst.
     // Calls write(builder) only when offset is negative and lies outside the
     // range [lastRight, firstRight]; otherwise does nothing.
240  inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
241  StringTrieBuilder &builder) {
242  // Note: Edge numbers are negative, lastRight<=firstRight.
243  // If offset>0 then this node and its sub-nodes have been written already
244  // and we need not write them again.
245  // If this node is part of the unwritten right branch edge,
246  // then we wait until that is written.
247  if(offset<0 && (offset<lastRight || firstRight<offset)) {
248  write(builder);
249  }
250  }
     // Returns the current offset value (see the class comment for its meaning).
251  inline int32_t getOffset() const { return offset; }
252  protected:
253  int32_t hash;    // Precomputed hash; derived classes may update it (e.g. ValueNode::setValue()).
254  int32_t offset;  // 0 initially; negative = edge number; positive = written.
255  };
256 
257 #ifndef U_HIDE_INTERNAL_API
258  // This class should not be overridden because
259  // registerFinalValue() compares a stack-allocated FinalValueNode
260  // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
261  // with the input node, and the
262  // !Node::operator==(other) used inside FinalValueNode::operator==(other)
263  // will be false if the typeid's are different.
// Node that holds only a final value.
// Marked `final`: registerFinalValue() compares a stack-allocated
// FinalValueNode with registered nodes, and the base-class operator==()
// compares the actual class types, so an instance of a subclass could never
// compare equal. `final` turns that documented restriction into a compile error.
265  class FinalValueNode final : public Node {
266  public:
     // v: the final value; it is also folded into the initial hash (seed 0x111111).
267  FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {}
     // Overrides Node::operator==(); `override` makes the compiler verify the signature.
268  virtual UBool operator==(const Node &other) const override;
     // Overrides the pure virtual Node::write().
269  virtual void write(StringTrieBuilder &builder) override;
270  protected:
271  int32_t value;
272  };
273 #endif /* U_HIDE_INTERNAL_API */
274 
275  // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
276  // it is needed for layout of other objects.
// Intermediate base class for nodes that may optionally carry a value.
// It does not implement write(), so it remains abstract.
280  class ValueNode : public Node {
281  public:
     // Starts without a value (hasValue=FALSE, value=0).
282  ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
     // Overrides Node::operator==(); `override` makes the compiler verify the signature.
283  virtual UBool operator==(const Node &other) const override;
     // Attaches a value and folds it into the hash. Because the hash changes,
     // NOTE(review): this presumably has to happen before the node is
     // registered in the hash set -- confirm at the call sites.
284  void setValue(int32_t v) {
285  hasValue=TRUE;
286  value=v;
287  hash=hash*37u+v;
288  }
289  protected:
290  UBool hasValue;
291  int32_t value;
292  };
293 
294 #ifndef U_HIDE_INTERNAL_API
295 
// ValueNode that always has a value and is followed by another node.
298  class IntermediateValueNode : public ValueNode {
299  public:
     // v: the value, set via setValue() and thereby folded into the hash.
     // nextNode: the following node; may be NULL, since the static
     // Node::hashCode() used for the hash handles NULL.
300  IntermediateValueNode(int32_t v, Node *nextNode)
301  : ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); }
     // The three overrides below carry `override` so the compiler verifies
     // that each one matches the corresponding virtual in Node/ValueNode.
302  virtual UBool operator==(const Node &other) const override;
303  virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
304  virtual void write(StringTrieBuilder &builder) override;
305  protected:
306  Node *next;
307  };
308 #endif /* U_HIDE_INTERNAL_API */
309 
310  // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
311  // it is needed for layout of other objects.
// ValueNode for a linear match of `length` units followed by `next`.
// It does not implement write(), so it remains abstract; concrete builders
// supply their own subclass through createLinearMatchNode().
315  class LinearMatchNode : public ValueNode {
316  public:
     // len and nextNode are both folded into the initial hash (seed 0x333333).
     // nextNode may be NULL, since the static Node::hashCode() handles NULL.
317  LinearMatchNode(int32_t len, Node *nextNode)
318  : ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)),
319  length(len), next(nextNode) {}
     // Overrides carry `override` so the compiler verifies the signatures.
320  virtual UBool operator==(const Node &other) const override;
321  virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
322  protected:
323  int32_t length;
324  Node *next;
325  };
326 
327 #ifndef U_HIDE_INTERNAL_API
328 
// Common base class of the branch node types; adds firstEdgeNumber to Node.
// It does not implement write(), so it remains abstract.
331  class BranchNode : public Node {
332  public:
     // firstEdgeNumber is now initialized to 0 here so that it never holds an
     // indeterminate value; previously the constructor left it uninitialized.
     // NOTE(review): presumably assigned its real value by
     // markRightEdgesFirst() in the subclasses -- confirm in the implementation.
333  BranchNode(int32_t initialHash) : Node(initialHash), firstEdgeNumber(0) {}
334  protected:
335  int32_t firstEdgeNumber;
336  };
337 
// Branch node holding up to kMaxBranchLinearSubNodeLength (unit, target)
// entries in fixed-size parallel arrays. Only entries [0, length) are ever
// assigned; the constructor leaves the array contents unset.
341  class ListBranchNode : public BranchNode {
342  public:
     // Starts empty with hash seed 0x444444; every add() folds its entry into the hash.
343  ListBranchNode() : BranchNode(0x444444), length(0) {}
     // Overrides carry `override` so the compiler verifies the signatures.
344  virtual UBool operator==(const Node &other) const override;
345  virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
346  virtual void write(StringTrieBuilder &builder) override;
347  // Adds a unit with a final value.
     // Precondition: length < kMaxBranchLinearSubNodeLength. There is no
     // bounds check, so the caller must not exceed the array capacity.
348  void add(int32_t c, int32_t value) {
349  units[length]=(char16_t)c;
350  equal[length]=NULL;
351  values[length]=value;
352  ++length;
353  hash=(hash*37u+c)*37u+value;
354  }
355  // Adds a unit which leads to another match node.
     // Same unchecked capacity precondition as the overload above.
     // values[] is set to 0 for such entries.
356  void add(int32_t c, Node *node) {
357  units[length]=(char16_t)c;
358  equal[length]=node;
359  values[length]=0;
360  ++length;
361  hash=(hash*37u+c)*37u+hashCode(node);
362  }
363  protected:
364  Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
365  int32_t length;
366  int32_t values[kMaxBranchLinearSubNodeLength];
367  char16_t units[kMaxBranchLinearSubNodeLength];
368  };
369 
// Branch node that splits on a middle unit into a "less than" child and a
// "greater or equal" child.
373  class SplitBranchNode : public BranchNode {
374  public:
     // The unit and both children are folded into the initial hash (seed 0x555555).
     // Either child may be NULL, since the static Node::hashCode() handles NULL.
375  SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
376  : BranchNode(((0x555555u*37u+middleUnit)*37u+
377  hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)),
378  unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
     // Overrides carry `override` so the compiler verifies the signatures.
379  virtual UBool operator==(const Node &other) const override;
380  virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
381  virtual void write(StringTrieBuilder &builder) override;
382  protected:
383  char16_t unit;
384  Node *lessThan;
385  Node *greaterOrEqual;
386  };
387 
388  // Branch head node, for writing the actual node lead unit.
     // As a ValueNode it can additionally carry a value via setValue().
390  class BranchHeadNode : public ValueNode {
391  public:
     // len and subNode are folded into the initial hash (seed 0x666666).
     // subNode may be NULL, since the static Node::hashCode() handles NULL.
392  BranchHeadNode(int32_t len, Node *subNode)
393  : ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)),
394  length(len), next(subNode) {}
     // Overrides carry `override` so the compiler verifies the signatures.
395  virtual UBool operator==(const Node &other) const override;
396  virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
397  virtual void write(StringTrieBuilder &builder) override;
398  protected:
399  int32_t length;
400  Node *next; // A branch sub-node.
401  };
402 
403 #endif /* U_HIDE_INTERNAL_API */
404 
// Pure virtual hooks that a concrete builder implements to create its own
// linear-match node type and to emit serialized data.
// NOTE(review): the meaning of the int32_t return values is not visible in
// this header (presumably the resulting length or offset of the output) --
// confirm against the concrete builders.
407  virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
408  Node *nextNode) const = 0;
409 
     // Emits a single unit.
411  virtual int32_t write(int32_t unit) = 0;
     // Emits `length` units of element i starting at unitIndex -- TODO confirm.
413  virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
     // NOTE(review): the first parameter is named i here; whether it is a value
     // or an element index is not visible in this header -- confirm.
415  virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
417  virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
419  virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
420 };
421 
422 U_NAMESPACE_END
423 
424 #endif /* U_SHOW_CPLUSPLUS_API */
425 
426 #endif // __STRINGTRIEBUILDER_H__
struct UHashtable UHashtable
Definition: msgfmt.h:43
Builds a trie more slowly, attempting to generate a shorter but equivalent serialization.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:251
Builds a trie quickly.
#define NULL
Define NULL if necessary, to nullptr for C++ and to ((void *)0) for C.
Definition: utypes.h:188
#define TRUE
The TRUE value of a UBool.
Definition: umachine.h:265
C++ API: Common ICU base class UObject.
UStringTrieBuildOption
Build options for BytesTrieBuilder and CharsTrieBuilder.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
Basic definitions for ICU, for both C and C++ APIs.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:269
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:300
int8_t UBool
The ICU boolean type.
Definition: umachine.h:261