ICU 66.0.1  66.0.1
uniset.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 * Copyright (C) 1999-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 ***************************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 ***************************************************************************
11 */
12 
13 #ifndef UNICODESET_H
14 #define UNICODESET_H
15 
16 #include "unicode/utypes.h"
17 
18 #if U_SHOW_CPLUSPLUS_API
19 
20 #include "unicode/ucpmap.h"
21 #include "unicode/unifilt.h"
22 #include "unicode/unistr.h"
23 #include "unicode/uset.h"
24 
30 U_NAMESPACE_BEGIN
31 
32 // Forward Declarations.
33 class BMPSet;
34 class ParsePosition;
35 class RBBIRuleScanner;
36 class SymbolTable;
37 class UnicodeSetStringSpan;
38 class UVector;
39 class RuleCharacterIterator;
40 
282 private:
287  static constexpr int32_t INITIAL_CAPACITY = 25;
288  // fFlags constant
289  static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid)
290 
291  UChar32* list = stackList; // MUST be terminated with HIGH
292  int32_t capacity = INITIAL_CAPACITY; // capacity of list
293  int32_t len = 1; // length of list used; 1 <= len <= capacity
294  uint8_t fFlags = 0; // Bit flag (see constants above)
295 
296  BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not NULL.
297  UChar32* buffer = nullptr; // internal buffer, may be NULL
298  int32_t bufferCapacity = 0; // capacity of buffer
299 
309  char16_t *pat = nullptr; // NOTE(review): presumably the stored pattern text managed by setPattern()/releasePattern() — confirm in the implementation file
310  int32_t patLen = 0; // NOTE(review): presumably the length of pat in char16_t units — confirm
311 
312  UVector* strings = nullptr; // maintained in sorted order
313  UnicodeSetStringSpan *stringSpan = nullptr;
314 
320  UChar32 stackList[INITIAL_CAPACITY]; // in-object array of INITIAL_CAPACITY code points; the `list` member is default-initialized to point here
321 
322 public:
332  inline UBool isBogus(void) const; // Returns nonzero when the kIsBogus bit of fFlags is set, i.e. this object does not contain a valid set.
333 
350  void setToBogus();
351 
352 public:
353 
354  enum {
359  MIN_VALUE = 0,
360 
365  MAX_VALUE = 0x10ffff
366  };
367 
368  //----------------------------------------------------------------
369  // Constructors &c
370  //----------------------------------------------------------------
371 
372 public:
373 
378  UnicodeSet();
379 
388  UnicodeSet(UChar32 start, UChar32 end);
389 
390 #ifndef U_HIDE_INTERNAL_API
391 
394  enum ESerialization { // tag type selecting the serialized-array constructor declared right after this enum
395  kSerialized /* result of serialize() */
396  };
397 
408  UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
409  ESerialization serialization, UErrorCode &status);
410 #endif /* U_HIDE_INTERNAL_API */
411 
420  UnicodeSet(const UnicodeString& pattern,
421  UErrorCode& status);
422 
423 #ifndef U_HIDE_INTERNAL_API
424 
436  UnicodeSet(const UnicodeString& pattern,
437  uint32_t options,
438  const SymbolTable* symbols,
439  UErrorCode& status);
440 #endif /* U_HIDE_INTERNAL_API */
441 
455  UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
456  uint32_t options,
457  const SymbolTable* symbols,
458  UErrorCode& status);
459 
464  UnicodeSet(const UnicodeSet& o);
465 
470  virtual ~UnicodeSet();
471 
477  UnicodeSet& operator=(const UnicodeSet& o);
478 
490  virtual UBool operator==(const UnicodeSet& o) const;
491 
497  inline UBool operator!=(const UnicodeSet& o) const; // Inequality test: returns the logical negation of operator==(o).
498 
508  virtual UnicodeSet* clone() const;
509 
517  virtual int32_t hashCode(void) const;
518 
527  inline static UnicodeSet *fromUSet(USet *uset); // Gets a UnicodeSet pointer from a USet (the C API type corresponding to this class).
528 
537  inline static const UnicodeSet *fromUSet(const USet *uset);
538 
546  inline USet *toUSet(); // Produces a USet * pointer for this UnicodeSet.
547 
548 
556  inline const USet * toUSet() const;
557 
558 
559  //----------------------------------------------------------------
560  // Freezable API
561  //----------------------------------------------------------------
562 
571  inline UBool isFrozen() const; // Determines whether the set has been frozen (made immutable): true iff bmpSet or stringSpan is non-null.
572 
586  UnicodeSet *freeze();
587 
596  UnicodeSet *cloneAsThawed() const;
597 
598  //----------------------------------------------------------------
599  // Public API
600  //----------------------------------------------------------------
601 
611  UnicodeSet& set(UChar32 start, UChar32 end);
612 
618  static UBool resemblesPattern(const UnicodeString& pattern,
619  int32_t pos);
620 
633  UnicodeSet& applyPattern(const UnicodeString& pattern,
634  UErrorCode& status);
635 
636 #ifndef U_HIDE_INTERNAL_API
637 
653  UnicodeSet& applyPattern(const UnicodeString& pattern,
654  uint32_t options,
655  const SymbolTable* symbols,
656  UErrorCode& status);
657 #endif /* U_HIDE_INTERNAL_API */
658 
690  UnicodeSet& applyPattern(const UnicodeString& pattern,
691  ParsePosition& pos,
692  uint32_t options,
693  const SymbolTable* symbols,
694  UErrorCode& status);
695 
709  virtual UnicodeString& toPattern(UnicodeString& result,
710  UBool escapeUnprintable = FALSE) const;
711 
734  UnicodeSet& applyIntPropertyValue(UProperty prop,
735  int32_t value,
736  UErrorCode& ec);
737 
767  UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
768  const UnicodeString& value,
769  UErrorCode& ec);
770 
779  virtual int32_t size(void) const;
780 
787  virtual UBool isEmpty(void) const;
788 
796  virtual UBool contains(UChar32 c) const;
797 
806  virtual UBool contains(UChar32 start, UChar32 end) const;
807 
815  UBool contains(const UnicodeString& s) const;
816 
824  virtual UBool containsAll(const UnicodeSet& c) const;
825 
833  UBool containsAll(const UnicodeString& s) const;
834 
843  UBool containsNone(UChar32 start, UChar32 end) const;
844 
852  UBool containsNone(const UnicodeSet& c) const;
853 
861  UBool containsNone(const UnicodeString& s) const;
862 
871  inline UBool containsSome(UChar32 start, UChar32 end) const; // Returns true if this set contains one or more characters in the given range; the negation of containsNone(start, end).
872 
880  inline UBool containsSome(const UnicodeSet& s) const;
881 
889  inline UBool containsSome(const UnicodeString& s) const;
890 
909  int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
910 
923  inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const; // UnicodeString overload: start is clamped to [0, s.length()]; returns start plus the char16_t* overload's result for the remainder of s.
924 
942  int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
943 
957  inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const; // UnicodeString overload: limit is clamped to [0, s.length()]; only the first `limit` code units of s are passed to the char16_t* overload.
958 
977  int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
978 
996  int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
997 
1002  virtual UMatchDegree matches(const Replaceable& text,
1003  int32_t& offset,
1004  int32_t limit,
1005  UBool incremental); // Implements the UnicodeMatcher API; the UMatchDegree result indicates the degree of match.
1006 
1007 private:
1030  static int32_t matchRest(const Replaceable& text,
1031  int32_t start, int32_t limit,
1032  const UnicodeString& s);
1033 
1043  int32_t findCodePoint(UChar32 c) const;
1044 
1045 public:
1046 
1054  virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
1055 
1064  int32_t indexOf(UChar32 c) const;
1065 
1075  UChar32 charAt(int32_t index) const;
1076 
1091  virtual UnicodeSet& add(UChar32 start, UChar32 end);
1092 
1100  UnicodeSet& add(UChar32 c);
1101 
1113  UnicodeSet& add(const UnicodeString& s);
1114 
1115  private:
1121  static int32_t getSingleCP(const UnicodeString& s);
1122 
1123  void _add(const UnicodeString& s);
1124 
1125  public:
1134  UnicodeSet& addAll(const UnicodeString& s);
1135 
1144  UnicodeSet& retainAll(const UnicodeString& s);
1145 
1154  UnicodeSet& complementAll(const UnicodeString& s);
1155 
1164  UnicodeSet& removeAll(const UnicodeString& s);
1165 
1174  static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1175 
1176 
1184  static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1185 
1199  virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1200 
1201 
1207  UnicodeSet& retain(UChar32 c);
1208 
1222  virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1223 
1231  UnicodeSet& remove(UChar32 c);
1232 
1242  UnicodeSet& remove(const UnicodeString& s);
1243 
1251  virtual UnicodeSet& complement(void);
1252 
1267  virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1268 
1276  UnicodeSet& complement(UChar32 c);
1277 
1288  UnicodeSet& complement(const UnicodeString& s);
1289 
1302  virtual UnicodeSet& addAll(const UnicodeSet& c);
1303 
1315  virtual UnicodeSet& retainAll(const UnicodeSet& c);
1316 
1328  virtual UnicodeSet& removeAll(const UnicodeSet& c);
1329 
1340  virtual UnicodeSet& complementAll(const UnicodeSet& c);
1341 
1348  virtual UnicodeSet& clear(void);
1349 
1375  UnicodeSet& closeOver(int32_t attribute);
1376 
1383  virtual UnicodeSet &removeAllStrings();
1384 
1392  virtual int32_t getRangeCount(void) const;
1393 
1401  virtual UChar32 getRangeStart(int32_t index) const;
1402 
1410  virtual UChar32 getRangeEnd(int32_t index) const;
1411 
1460  int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1461 
1468  virtual UnicodeSet& compact();
1469 
1481  static UClassID U_EXPORT2 getStaticClassID(void); // ICU "poor man's RTTI": returns a UClassID for this class.
1482 
1491  virtual UClassID getDynamicClassID(void) const;
1492 
1493 private:
1494 
1495  // Private API for the USet API
1496 
1497  friend class USetAccess;
1498 
1499  const UnicodeString* getString(int32_t index) const;
1500 
1501  //----------------------------------------------------------------
1502  // RuleBasedTransliterator support
1503  //----------------------------------------------------------------
1504 
1505 private:
1506 
1512  virtual UBool matchesIndexValue(uint8_t v) const;
1513 
1514 private:
1515  friend class RBBIRuleScanner;
1516 
1517  //----------------------------------------------------------------
1518  // Implementation: Clone as thawed (see ICU4J Freezable)
1519  //----------------------------------------------------------------
1520 
1521  UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1522  UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
1523 
1524  //----------------------------------------------------------------
1525  // Implementation: Pattern parsing
1526  //----------------------------------------------------------------
1527 
1528  void applyPatternIgnoreSpace(const UnicodeString& pattern,
1529  ParsePosition& pos,
1530  const SymbolTable* symbols,
1531  UErrorCode& status);
1532 
1533  void applyPattern(RuleCharacterIterator& chars,
1534  const SymbolTable* symbols,
1535  UnicodeString& rebuiltPat,
1536  uint32_t options,
1537  UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
1538  int32_t depth,
1539  UErrorCode& ec);
1540 
1541  //----------------------------------------------------------------
1542  // Implementation: Utility methods
1543  //----------------------------------------------------------------
1544 
1545  static int32_t nextCapacity(int32_t minCapacity);
1546 
1547  bool ensureCapacity(int32_t newLen);
1548 
1549  bool ensureBufferCapacity(int32_t newLen);
1550 
1551  void swapBuffers(void);
1552 
1553  UBool allocateStrings(UErrorCode &status);
1554  UBool hasStrings() const;
1555  int32_t stringsSize() const;
1556  UBool stringsContains(const UnicodeString &s) const;
1557 
1558  UnicodeString& _toPattern(UnicodeString& result,
1559  UBool escapeUnprintable) const;
1560 
1561  UnicodeString& _generatePattern(UnicodeString& result,
1562  UBool escapeUnprintable) const;
1563 
1564  static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1565 
1566  static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1567 
1568  //----------------------------------------------------------------
1569  // Implementation: Fundamental operators
1570  //----------------------------------------------------------------
1571 
1572  void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1573 
1574  void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1575 
1576  void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1577 
1583  static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1584  int32_t pos);
1585 
1586  static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1587  int32_t iterOpts);
1588 
1628  UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1629  ParsePosition& ppos,
1630  UErrorCode &ec);
1631 
1632  void applyPropertyPattern(RuleCharacterIterator& chars,
1633  UnicodeString& rebuiltPat,
1634  UErrorCode& ec);
1635 
1636  static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
1637 
1642  typedef UBool (*Filter)(UChar32 codePoint, void* context);
1643 
1653  void applyFilter(Filter filter,
1654  void* context,
1655  const UnicodeSet* inclusions,
1656  UErrorCode &status);
1657 
1658  // UCPMap is now stable ICU 63
1659  void applyIntPropertyValue(const UCPMap *map,
1660  UCPMapValueFilter *filter, const void *context,
1661  UErrorCode &errorCode);
1662 
1666  void setPattern(const UnicodeString& newPat) { // convenience overload taking a UnicodeString
1667  setPattern(newPat.getBuffer(), newPat.length()); // forwards buffer and length to the (const char16_t *, int32_t) overload
1668  }
1669  void setPattern(const char16_t *newPat, int32_t newPatLen);
1673  void releasePattern();
1674 
1675  friend class UnicodeSetIterator;
1676 };
1677 
1678 
1679 
1680 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const { // inequality is defined purely in terms of equality
1681  return !operator==(o); // delegates to operator==(o), which the class declares virtual
1682 }
1683 
1684 inline UBool UnicodeSet::isFrozen() const { // reports whether this set is frozen
1685  return (UBool)(bmpSet!=NULL || stringSpan!=NULL); // frozen iff either helper object exists, as the bmpSet member comment states
1686 }
1687 
1688 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const { // true if at least one character of the range is in the set
1689  return !containsNone(start, end); // "some" is the negation of "none"
1690 }
1691 
1692 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const { // UnicodeSet overload: true when containsNone(s) is false
1693  return !containsNone(s);
1694 }
1695 
1696 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const { // UnicodeString overload: true when containsNone(s) is false
1697  return !containsNone(s);
1698 }
1699 
1700 inline UBool UnicodeSet::isBogus() const { // reports whether this set is marked bogus (not valid)
1701  return (UBool)(fFlags & kIsBogus); // kIsBogus is 1, so the masked value is already 0 or 1
1702 }
1703 
1704 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) { // gets a UnicodeSet pointer from a USet
1705  return reinterpret_cast<UnicodeSet *>(uset); // pointer reinterpretation only; no object is created or copied
1706 }
1707 
1708 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) { // const overload: gets a const UnicodeSet pointer from a const USet
1709  return reinterpret_cast<const UnicodeSet *>(uset); // pointer reinterpretation only; no object is created or copied
1710 }
1711 
1712 inline USet *UnicodeSet::toUSet() { // produces a USet * pointer for this UnicodeSet
1713  return reinterpret_cast<USet *>(this); // pointer reinterpretation only; the result refers to this same object
1714 }
1715 
1716 inline const USet *UnicodeSet::toUSet() const { // const overload: produces a const USet * pointer for this UnicodeSet
1717  return reinterpret_cast<const USet *>(this); // pointer reinterpretation only; the result refers to this same object
1718 }
1719 
1720 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const { // UnicodeString overload built on the char16_t* overload
1721  int32_t sLength=s.length();
1722  if(start<0) { // clamp start into [0, sLength] so the pointer arithmetic below stays inside the string
1723  start=0;
1724  } else if(start>sLength) {
1725  start=sLength;
1726  }
1727  return start+span(s.getBuffer()+start, sLength-start, spanCondition); // the callee sees only the tail beginning at start, so start is added back to its result
1728 }
1729 
1730 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const { // UnicodeString overload built on the char16_t* overload
1731  int32_t sLength=s.length();
1732  if(limit<0) { // clamp limit into [0, sLength]
1733  limit=0;
1734  } else if(limit>sLength) {
1735  limit=sLength;
1736  }
1737  return spanBack(s.getBuffer(), limit, spanCondition); // the callee is given the first `limit` code units; its result is returned unchanged
1738 }
1739 
1740 U_NAMESPACE_END
1741 
1742 #endif /* U_SHOW_CPLUSPLUS_API */
1743 
1744 #endif
#define INITIAL_CAPACITY
The initial size of an array if it is unspecified.
Definition: RunArrays.h:32
static UClassID getStaticClassID()
ICU "poor man's RTTI", returns a UClassID for this class.
struct UCPMap UCPMap
Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.
Definition: ucpmap.h:31
int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const
Returns the start of the trailing substring of the input string which consists only of characters and...
UMatchDegree
Constants returned by UnicodeMatcher::matches() indicating the degree of match.
Definition: unimatch.h:33
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
UnicodeSetIterator iterates over the contents of a UnicodeSet.
Definition: usetiter.h:66
UBool isBogus(void) const
Determine if this object contains a valid set.
Definition: uniset.h:1700
UBool operator!=(const UnicodeSet &o) const
Compares the specified object with this set for inequality; returns the negation of operator==.
Definition: uniset.h:1680
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:96
This file defines an abstract map from Unicode code points to integer values.
static UnicodeSet * fromUSet(USet *uset)
Get a UnicodeSet pointer from a USet.
Definition: uniset.h:1704
virtual UBool matchesIndexValue(uint8_t v) const =0
Returns TRUE if this matcher will match a character c, where (c & 0xFF) == v, at offset, in the forward direction (with limit > offset).
C API: Unicode Set.
An interface that defines both lookup protocol and parsing of symbolic names.
Definition: symtable.h:59
virtual UClassID getDynamicClassID(void) const =0
Returns a unique class ID polymorphically.
Replaceable is an abstract base class representing a string of characters that supports the replaceme...
Definition: rep.h:77
UnicodeFilter defines a protocol for selecting a subset of the full range (U+0000 to U+10FFFF) of Uni...
Definition: unifilt.h:65
virtual void addMatchSetTo(UnicodeSet &toUnionTo) const =0
Union the set of all characters that may be matched by this object into the given set...
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:251
uint32_t UCPMapValueFilter(const void *context, uint32_t value)
Callback function type: Modifies a map value.
Definition: ucpmap.h:114
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:425
#define NULL
Define NULL if necessary, to nullptr for C++ and to ((void *)0) for C.
Definition: utypes.h:188
virtual UMatchDegree matches(const Replaceable &text, int32_t &offset, int32_t limit, UBool incremental)
Implement UnicodeMatcher API.
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:281
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition: uset.h:156
UProperty
Selection constants for Unicode properties.
Definition: uchar.h:195
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
struct USet USet
USet is the C API type corresponding to C++ class UnicodeSet.
Definition: uset.h:47
int32_t length(void) const
Return the length of the UnicodeString object.
Definition: unistr.h:3890
ParsePosition is a simple class used by Format and its subclasses to keep track of the current positi...
Definition: parsepos.h:52
#define U_FINAL
Defined to the C++11 "final" keyword if available.
Definition: umachine.h:140
char16_t * getBuffer(int32_t minCapacity)
Get a read/write pointer to the internal buffer.
virtual UnicodeFilter * clone() const =0
Clones this object polymorphically.
virtual UnicodeString & toPattern(UnicodeString &result, UBool escapeUnprintable=FALSE) const =0
Returns a string representation of this matcher.
Basic definitions for ICU, for both C and C++ APIs.
UBool containsSome(UChar32 start, UChar32 end) const
Returns true if this set contains one or more of the characters in the given range.
Definition: uniset.h:1688
virtual UBool contains(UChar32 c) const =0
Returns true for characters that are in the selected subset.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:269
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:300
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:294
UBool isFrozen() const
Determines whether the set has been frozen (made immutable) or not.
Definition: uniset.h:1684
USet * toUSet()
Produce a USet * pointer for this UnicodeSet.
Definition: uniset.h:1712
C++ API: Unicode Filter.
int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const
Returns the length of the initial substring of the input string which consists only of characters and...
int8_t UBool
The ICU boolean type.
Definition: umachine.h:261