ICU 65.1  65.1
ucharstrie.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010-2012, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucharstrie.h
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2010nov14
14 * created by: Markus W. Scherer
15 */
16 
17 #ifndef __UCHARSTRIE_H__
18 #define __UCHARSTRIE_H__
19 
26 #include "unicode/utypes.h"
27 
28 #if U_SHOW_CPLUSPLUS_API
29 
30 #include "unicode/unistr.h"
31 #include "unicode/uobject.h"
32 #include "unicode/ustringtrie.h"
33 
34 U_NAMESPACE_BEGIN
35 
36 class Appendable;
37 class UCharsTrieBuilder;
38 class UVector32;
39 
54 public:
70  : ownedArray_(NULL), uchars_(trieUChars),
71  pos_(uchars_), remainingMatchLength_(-1) {}
72 
77  ~UCharsTrie();
78 
85  UCharsTrie(const UCharsTrie &other)
86  : ownedArray_(NULL), uchars_(other.uchars_),
87  pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
88 
95  pos_=uchars_;
96  remainingMatchLength_=-1;
97  return *this;
98  }
99 
100 #ifndef U_HIDE_DRAFT_API
101 
109  uint64_t getState64() const {
110  return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) |
111  (uint64_t)(pos_ - uchars_);
112  }
113 
128  UCharsTrie &resetToState64(uint64_t state) {
129  remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2;
130  pos_ = uchars_ + (state & kState64PosMask);
131  return *this;
132  }
133 #endif /* U_HIDE_DRAFT_API */
134 
140  class State : public UMemory {
141  public:
146  State() { uchars=NULL; }
147  private:
148  friend class UCharsTrie;
149 
150  const char16_t *uchars;
151  const char16_t *pos;
152  int32_t remainingMatchLength;
153  };
154 
162  const UCharsTrie &saveState(State &state) const {
163  state.uchars=uchars_;
164  state.pos=pos_;
165  state.remainingMatchLength=remainingMatchLength_;
166  return *this;
167  }
168 
179  UCharsTrie &resetToState(const State &state) {
180  if(uchars_==state.uchars && uchars_!=NULL) {
181  pos_=state.pos;
182  remainingMatchLength_=state.remainingMatchLength;
183  }
184  return *this;
185  }
186 
193  UStringTrieResult current() const;
194 
202  inline UStringTrieResult first(int32_t uchar) {
203  remainingMatchLength_=-1;
204  return nextImpl(uchars_, uchar);
205  }
206 
215  UStringTrieResult firstForCodePoint(UChar32 cp);
216 
223  UStringTrieResult next(int32_t uchar);
224 
232  UStringTrieResult nextForCodePoint(UChar32 cp);
233 
249  UStringTrieResult next(ConstChar16Ptr s, int32_t length);
250 
260  inline int32_t getValue() const {
261  const char16_t *pos=pos_;
262  int32_t leadUnit=*pos++;
263  // U_ASSERT(leadUnit>=kMinValueLead);
264  return leadUnit&kValueIsFinal ?
265  readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit);
266  }
267 
277  inline UBool hasUniqueValue(int32_t &uniqueValue) const {
278  const char16_t *pos=pos_;
279  // Skip the rest of a pending linear-match node.
280  return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
281  }
282 
290  int32_t getNextUChars(Appendable &out) const;
291 
296  class U_COMMON_API Iterator : public UMemory {
297  public:
309  Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, UErrorCode &errorCode);
310 
322  Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
323 
328  ~Iterator();
329 
335  Iterator &reset();
336 
341  UBool hasNext() const;
342 
357  UBool next(UErrorCode &errorCode);
358 
363  const UnicodeString &getString() const { return str_; }
368  int32_t getValue() const { return value_; }
369 
370  private:
371  UBool truncateAndStop() {
372  pos_=NULL;
373  value_=-1; // no real value for str
374  return TRUE;
375  }
376 
377  const char16_t *branchNext(const char16_t *pos, int32_t length, UErrorCode &errorCode);
378 
379  const char16_t *uchars_;
380  const char16_t *pos_;
381  const char16_t *initialPos_;
382  int32_t remainingMatchLength_;
383  int32_t initialRemainingMatchLength_;
384  UBool skipValue_; // Skip intermediate value which was already delivered.
385 
386  UnicodeString str_;
387  int32_t maxLength_;
388  int32_t value_;
389 
390  // The stack stores pairs of integers for backtracking to another
391  // outbound edge of a branch node.
392  // The first integer is an offset from uchars_.
393  // The second integer has the str_.length() from before the node in bits 15..0,
394  // and the remaining branch length in bits 31..16.
395  // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
396  // but the code looks more confusing that way.)
397  UVector32 *stack_;
398  };
399 
400 private:
401  friend class UCharsTrieBuilder;
402 
409  UCharsTrie(char16_t *adoptUChars, const char16_t *trieUChars)
410  : ownedArray_(adoptUChars), uchars_(trieUChars),
411  pos_(uchars_), remainingMatchLength_(-1) {}
412 
413  // No assignment operator.
414  UCharsTrie &operator=(const UCharsTrie &other);
415 
416  inline void stop() {
417  pos_=NULL;
418  }
419 
420  // Reads a compact 32-bit integer.
421  // pos is already after the leadUnit, and the lead unit has bit 15 reset.
422  static inline int32_t readValue(const char16_t *pos, int32_t leadUnit) {
423  int32_t value;
424  if(leadUnit<kMinTwoUnitValueLead) {
425  value=leadUnit;
426  } else if(leadUnit<kThreeUnitValueLead) {
427  value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos;
428  } else {
429  value=(pos[0]<<16)|pos[1];
430  }
431  return value;
432  }
433  static inline const char16_t *skipValue(const char16_t *pos, int32_t leadUnit) {
434  if(leadUnit>=kMinTwoUnitValueLead) {
435  if(leadUnit<kThreeUnitValueLead) {
436  ++pos;
437  } else {
438  pos+=2;
439  }
440  }
441  return pos;
442  }
443  static inline const char16_t *skipValue(const char16_t *pos) {
444  int32_t leadUnit=*pos++;
445  return skipValue(pos, leadUnit&0x7fff);
446  }
447 
448  static inline int32_t readNodeValue(const char16_t *pos, int32_t leadUnit) {
449  // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
450  int32_t value;
451  if(leadUnit<kMinTwoUnitNodeValueLead) {
452  value=(leadUnit>>6)-1;
453  } else if(leadUnit<kThreeUnitNodeValueLead) {
454  value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos;
455  } else {
456  value=(pos[0]<<16)|pos[1];
457  }
458  return value;
459  }
460  static inline const char16_t *skipNodeValue(const char16_t *pos, int32_t leadUnit) {
461  // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
462  if(leadUnit>=kMinTwoUnitNodeValueLead) {
463  if(leadUnit<kThreeUnitNodeValueLead) {
464  ++pos;
465  } else {
466  pos+=2;
467  }
468  }
469  return pos;
470  }
471 
472  static inline const char16_t *jumpByDelta(const char16_t *pos) {
473  int32_t delta=*pos++;
474  if(delta>=kMinTwoUnitDeltaLead) {
475  if(delta==kThreeUnitDeltaLead) {
476  delta=(pos[0]<<16)|pos[1];
477  pos+=2;
478  } else {
479  delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++;
480  }
481  }
482  return pos+delta;
483  }
484 
485  static const char16_t *skipDelta(const char16_t *pos) {
486  int32_t delta=*pos++;
487  if(delta>=kMinTwoUnitDeltaLead) {
488  if(delta==kThreeUnitDeltaLead) {
489  pos+=2;
490  } else {
491  ++pos;
492  }
493  }
494  return pos;
495  }
496 
497  static inline UStringTrieResult valueResult(int32_t node) {
499  }
500 
501  // Handles a branch node for both next(uchar) and next(string).
502  UStringTrieResult branchNext(const char16_t *pos, int32_t length, int32_t uchar);
503 
504  // Requires remainingLength_<0.
505  UStringTrieResult nextImpl(const char16_t *pos, int32_t uchar);
506 
507  // Helper functions for hasUniqueValue().
508  // Recursively finds a unique value (or whether there is not a unique one)
509  // from a branch.
510  static const char16_t *findUniqueValueFromBranch(const char16_t *pos, int32_t length,
511  UBool haveUniqueValue, int32_t &uniqueValue);
512  // Recursively finds a unique value (or whether there is not a unique one)
513  // starting from a position on a node lead unit.
514  static UBool findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
515 
516  // Helper functions for getNextUChars().
517  // getNextUChars() when pos is on a branch node.
518  static void getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out);
519 
520  // UCharsTrie data structure
521  //
522  // The trie consists of a series of char16_t-serialized nodes for incremental
523  // Unicode string/char16_t sequence matching. (char16_t=16-bit unsigned integer)
524  // The root node is at the beginning of the trie data.
525  //
526  // Types of nodes are distinguished by their node lead unit ranges.
527  // After each node, except a final-value node, another node follows to
528  // encode match values or continue matching further units.
529  //
530  // Node types:
531  // - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
532  // The value is for the string/char16_t sequence so far.
533  // - Match node, optionally with an intermediate value in a different compact format.
534  // The value, if present, is for the string/char16_t sequence so far.
535  //
536  // Aside from the value, which uses the node lead unit's high bits:
537  //
538  // - Linear-match node: Matches a number of units.
539  // - Branch node: Branches to other nodes according to the current input unit.
540  // The node unit is the length of the branch (number of units to select from)
541  // minus 1. It is followed by a sub-node:
542  // - If the length is at most kMaxBranchLinearSubNodeLength, then
543  // there are length-1 (key, value) pairs and then one more comparison unit.
544  // If one of the key units matches, then the value is either a final value for
545  // the string so far, or a "jump" delta to the next node.
546  // If the last unit matches, then matching continues with the next node.
547  // (Values have the same encoding as final-value nodes.)
548  // - If the length is greater than kMaxBranchLinearSubNodeLength, then
549  // there is one unit and one "jump" delta.
550  // If the input unit is less than the sub-node unit, then "jump" by delta to
551  // the next sub-node which will have a length of length/2.
552  // (The delta has its own compact encoding.)
553  // Otherwise, skip the "jump" delta to the next sub-node
554  // which will have a length of length-length/2.
555 
556  // Match-node lead unit values, after masking off intermediate-value bits:
557 
558  // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
559  // the length is one more than the next unit.
560 
561  // For a branch sub-node with at most this many entries, we drop down
562  // to a linear search.
563  static const int32_t kMaxBranchLinearSubNodeLength=5;
564 
565  // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
566  static const int32_t kMinLinearMatch=0x30;
567  static const int32_t kMaxLinearMatchLength=0x10;
568 
569  // Match-node lead unit bits 14..6 for the optional intermediate value.
570  // If these bits are 0, then there is no intermediate value.
571  // Otherwise, see the *NodeValue* constants below.
572  static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040
573  static const int32_t kNodeTypeMask=kMinValueLead-1; // 0x003f
574 
575  // A final-value node has bit 15 set.
576  static const int32_t kValueIsFinal=0x8000;
577 
578  // Compact value: After testing and masking off bit 15, use the following thresholds.
579  static const int32_t kMaxOneUnitValue=0x3fff;
580 
581  static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000
582  static const int32_t kThreeUnitValueLead=0x7fff;
583 
584  static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff
585 
586  // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
587  static const int32_t kMaxOneUnitNodeValue=0xff;
588  static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040
589  static const int32_t kThreeUnitNodeValueLead=0x7fc0;
590 
591  static const int32_t kMaxTwoUnitNodeValue=
592  ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff
593 
594  // Compact delta integers.
595  static const int32_t kMaxOneUnitDelta=0xfbff;
596  static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00
597  static const int32_t kThreeUnitDeltaLead=0xffff;
598 
599  static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
600 
601  // For getState64():
602  // The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2
603  // so we need at least 5 bits for that.
604  // We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength.
605  static constexpr int32_t kState64RemainingShift = 59;
606  static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1;
607 
608  char16_t *ownedArray_;
609 
610  // Fixed value referencing the UCharsTrie words.
611  const char16_t *uchars_;
612 
613  // Iterator variables.
614 
615  // Pointer to next trie unit to read. NULL if no more matches.
616  const char16_t *pos_;
617  // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
618  int32_t remainingMatchLength_;
619 };
620 
621 U_NAMESPACE_END
622 
623 #endif /* U_SHOW_CPLUSPLUS_API */
624 
625 #endif // __UCHARSTRIE_H__
UCharsTrie & resetToState(const State &state)
Resets this trie to the saved state.
Definition: ucharstrie.h:179
int32_t getValue() const
Returns a matching string&#39;s value if called immediately after current()/first()/next() returned USTRI...
Definition: ucharstrie.h:260
C++ API: Unicode String.
int32_t getValue() const
Definition: ucharstrie.h:368
UStringTrieResult
Return values for BytesTrie::next(), UCharsTrie::next() and similar methods.
Definition: ustringtrie.h:35
UCharsTrie(const UCharsTrie &other)
Copy constructor, copies the other trie reader object and its state, but not the char16_t array which...
Definition: ucharstrie.h:85
UBool hasUniqueValue(int32_t &uniqueValue) const
Determines whether all strings reachable from the current state map to the same value.
Definition: ucharstrie.h:277
const UnicodeString & getString() const
Definition: ucharstrie.h:363
UCharsTrie & reset()
Resets this trie to its initial state.
Definition: ucharstrie.h:94
Iterator for all of the (string, value) pairs in a UCharsTrie.
Definition: ucharstrie.h:296
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:425
#define NULL
Define NULL if necessary, to nullptr for C++ and to ((void *)0) for C.
Definition: utypes.h:188
Builder class for UCharsTrie.
#define TRUE
The TRUE value of a UBool.
Definition: umachine.h:265
const UCharsTrie & saveState(State &state) const
Saves the state of this trie.
Definition: ucharstrie.h:162
C++ API: Common ICU base class UObject.
State()
Constructs an empty State.
Definition: ucharstrie.h:146
UCharsTrie & resetToState64(uint64_t state)
Resets this trie to the saved state.
Definition: ucharstrie.h:128
uint64_t getState64() const
Returns the state of this trie as a 64-bit integer.
Definition: ucharstrie.h:109
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
UCharsTrie state object, for saving a trie&#39;s current state and resetting the trie back to this state ...
Definition: ucharstrie.h:140
const char16_t * wrapper with implicit conversion from distinct but bit-compatible pointer types...
Definition: char16ptr.h:149
C API: Helper definitions for dictionary trie APIs.
UCharsTrie(ConstChar16Ptr trieUChars)
Constructs a UCharsTrie reader instance.
Definition: ucharstrie.h:69
Basic definitions for ICU, for both C and C++ APIs.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:269
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:300
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:294
The input unit(s) continued a matching string and there is a value for the string so far...
Definition: ustringtrie.h:66
UStringTrieResult first(int32_t uchar)
Traverses the trie from the initial state for this input char16_t.
Definition: ucharstrie.h:202
#define UINT64_C(c)
Provides a platform independent way to specify an unsigned 64-bit integer constant.
Definition: umachine.h:240
UMemory is the common ICU base class.
Definition: uobject.h:115
Light-weight, non-const reader class for a UCharsTrie.
Definition: ucharstrie.h:53
int8_t UBool
The ICU boolean type.
Definition: umachine.h:261
Base class for objects to which Unicode characters and strings can be appended.
Definition: appendable.h:54