ICU 66.0.1  66.0.1
regex.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: regex.h
9 * encoding: UTF-8
10 * indentation:4
11 *
12 * created on: 2002oct22
13 * created by: Andy Heninger
14 *
15 * ICU Regular Expressions, API for C++
16 */
17 
18 #ifndef REGEX_H
19 #define REGEX_H
20 
21 //#define REGEX_DEBUG
22 
45 #include "unicode/utypes.h"
46 
47 #if U_SHOW_CPLUSPLUS_API
48 
49 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
50 
51 #include "unicode/uobject.h"
52 #include "unicode/unistr.h"
53 #include "unicode/utext.h"
54 #include "unicode/parseerr.h"
55 
56 #include "unicode/uregex.h"
57 
58 // Forward Declarations
59 
60 struct UHashtable;
61 
62 U_NAMESPACE_BEGIN
63 
64 struct Regex8BitSet;
65 class RegexCImpl;
66 class RegexMatcher;
67 class RegexPattern;
68 struct REStackFrame;
69 class RuleBasedBreakIterator;
70 class UnicodeSet;
71 class UVector;
72 class UVector32;
73 class UVector64;
74 
75 
88 public:
89 
97  RegexPattern();
98 
105  RegexPattern(const RegexPattern &source);
106 
112  virtual ~RegexPattern();
113 
122  UBool operator==(const RegexPattern& that) const;
123 
132  inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);} // Logical negation of operator==().
133 
139  RegexPattern &operator =(const RegexPattern &source);
140 
148  virtual RegexPattern *clone() const;
149 
150 
175  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
176  UParseError &pe,
177  UErrorCode &status);
178 
205  static RegexPattern * U_EXPORT2 compile( UText *regex,
206  UParseError &pe,
207  UErrorCode &status);
208 
233  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
234  uint32_t flags,
235  UParseError &pe,
236  UErrorCode &status);
237 
264  static RegexPattern * U_EXPORT2 compile( UText *regex,
265  uint32_t flags,
266  UParseError &pe,
267  UErrorCode &status);
268 
291  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
292  uint32_t flags,
293  UErrorCode &status);
294 
319  static RegexPattern * U_EXPORT2 compile( UText *regex,
320  uint32_t flags,
321  UErrorCode &status);
322 
328  virtual uint32_t flags() const;
329 
347  virtual RegexMatcher *matcher(const UnicodeString &input,
348  UErrorCode &status) const;
349 
350 private:
363  RegexMatcher *matcher(const char16_t *input,
364  UErrorCode &status) const;
365 public:
366 
367 
379  virtual RegexMatcher *matcher(UErrorCode &status) const;
380 
381 
396  static UBool U_EXPORT2 matches(const UnicodeString &regex,
397  const UnicodeString &input,
398  UParseError &pe,
399  UErrorCode &status);
400 
415  static UBool U_EXPORT2 matches(UText *regex,
416  UText *input,
417  UParseError &pe,
418  UErrorCode &status);
419 
428  virtual UnicodeString pattern() const;
429 
430 
441  virtual UText *patternText(UErrorCode &status) const;
442 
443 
457  virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const;
458 
459 
476  virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const;
477 
478 
517  virtual int32_t split(const UnicodeString &input,
518  UnicodeString dest[],
519  int32_t destCapacity,
520  UErrorCode &status) const;
521 
522 
561  virtual int32_t split(UText *input,
562  UText *dest[],
563  int32_t destCapacity,
564  UErrorCode &status) const;
565 
566 
572  virtual UClassID getDynamicClassID() const;
573 
579  static UClassID U_EXPORT2 getStaticClassID();
580 
581 private:
582  //
583  // Implementation Data
584  //
585  UText *fPattern; // The original pattern string.
586  UnicodeString *fPatternString; // The original pattern UnicodeString if relevant
587  uint32_t fFlags; // The flags used when compiling the pattern.
588  //
589  UVector64 *fCompiledPat; // The compiled pattern p-code.
590  UnicodeString fLiteralText; // Any literal string data from the pattern,
591  // after un-escaping, for use during the match.
592 
593  UVector *fSets; // Any UnicodeSets referenced from the pattern.
594  Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
595 
596 
597  UErrorCode fDeferredStatus; // status if some prior error has left this
598  // RegexPattern in an unusable state.
599 
600  int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
601  // >= this value. For some patterns, this calculated
602  // value may be less than the true shortest
603  // possible match.
604 
605  int32_t fFrameSize; // Size of a state stack frame in the
606  // execution engine.
607 
608  int32_t fDataSize; // The size of the data needed by the pattern that
609  // does not go on the state stack, but has just
610  // a single copy per matcher.
611 
612  UVector32 *fGroupMap; // Map from capture group number to position of
613  // the group's variables in the matcher stack frame.
614 
615  UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
616  // regex character classes, e.g. Word.
617 
618  Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
619  // sets for predefined regex classes.
620 
621  int32_t fStartType; // Info on how a match must start.
622  int32_t fInitialStringIdx; //
623  int32_t fInitialStringLen;
624  UnicodeSet *fInitialChars;
625  UChar32 fInitialChar;
626  Regex8BitSet *fInitialChars8;
627  UBool fNeedsAltInput;
628 
629  UHashtable *fNamedCaptureMap; // Map from capture group names to numbers.
630 
631  friend class RegexCompile;
632  friend class RegexMatcher;
633  friend class RegexCImpl;
634 
635  //
636  // Implementation Methods
637  //
638  void init(); // Common initialization, for use by constructors.
639  bool initNamedCaptureMap(); // Lazy init for fNamedCaptureMap.
640  void zap(); // Common cleanup
641 
642  void dumpOp(int32_t index) const;
643 
644  public:
645 #ifndef U_HIDE_INTERNAL_API
646 
650  void dumpPattern() const;
651 #endif /* U_HIDE_INTERNAL_API */
652 };
653 
654 
655 
666 public:
667 
681  RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
682 
697  RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
698 
719  RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
720  uint32_t flags, UErrorCode &status);
721 
742  RegexMatcher(UText *regexp, UText *input,
743  uint32_t flags, UErrorCode &status);
744 
745 private:
757  RegexMatcher(const UnicodeString &regexp, const char16_t *input,
758  uint32_t flags, UErrorCode &status);
759 public:
760 
761 
767  virtual ~RegexMatcher();
768 
769 
776  virtual UBool matches(UErrorCode &status);
777 
778 
789  virtual UBool matches(int64_t startIndex, UErrorCode &status);
790 
791 
805  virtual UBool lookingAt(UErrorCode &status);
806 
807 
821  virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
822 
823 
836  virtual UBool find();
837 
838 
853  virtual UBool find(UErrorCode &status);
854 
864  virtual UBool find(int64_t start, UErrorCode &status);
865 
866 
876  virtual UnicodeString group(UErrorCode &status) const;
877 
878 
896  virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
897 
903  virtual int32_t groupCount() const;
904 
905 
920  virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
921 
942  virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
943 
951  virtual int32_t start(UErrorCode &status) const;
952 
960  virtual int64_t start64(UErrorCode &status) const;
961 
962 
976  virtual int32_t start(int32_t group, UErrorCode &status) const;
977 
991  virtual int64_t start64(int32_t group, UErrorCode &status) const;
992 
1006  virtual int32_t end(UErrorCode &status) const;
1007 
1021  virtual int64_t end64(UErrorCode &status) const;
1022 
1023 
1041  virtual int32_t end(int32_t group, UErrorCode &status) const;
1042 
1060  virtual int64_t end64(int32_t group, UErrorCode &status) const;
1061 
1070  virtual RegexMatcher &reset();
1071 
1072 
1088  virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
1089 
1090 
1108  virtual RegexMatcher &reset(const UnicodeString &input);
1109 
1110 
1124  virtual RegexMatcher &reset(UText *input);
1125 
1126 
1151  virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
1152 
1153 private:
1166  RegexMatcher &reset(const char16_t *input);
1167 public:
1168 
1176  virtual const UnicodeString &input() const;
1177 
1186  virtual UText *inputText() const;
1187 
1198  virtual UText *getInput(UText *dest, UErrorCode &status) const;
1199 
1200 
1219  virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
1220 
1232  virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
1233 
1242  virtual int32_t regionStart() const;
1243 
1252  virtual int64_t regionStart64() const;
1253 
1254 
1263  virtual int32_t regionEnd() const;
1264 
1273  virtual int64_t regionEnd64() const;
1274 
1283  virtual UBool hasTransparentBounds() const;
1284 
1303  virtual RegexMatcher &useTransparentBounds(UBool b);
1304 
1305 
1313  virtual UBool hasAnchoringBounds() const;
1314 
1315 
1328  virtual RegexMatcher &useAnchoringBounds(UBool b);
1329 
1330 
1343  virtual UBool hitEnd() const;
1344 
1354  virtual UBool requireEnd() const;
1355 
1356 
1362  virtual const RegexPattern &pattern() const;
1363 
1364 
1381  virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
1382 
1383 
1404  virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
1405 
1406 
1427  virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
1428 
1429 
1454  virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
1455 
1456 
1484  virtual RegexMatcher &appendReplacement(UnicodeString &dest,
1485  const UnicodeString &replacement, UErrorCode &status);
1486 
1487 
1515  virtual RegexMatcher &appendReplacement(UText *dest,
1516  UText *replacement, UErrorCode &status);
1517 
1518 
1529  virtual UnicodeString &appendTail(UnicodeString &dest);
1530 
1531 
1545  virtual UText *appendTail(UText *dest, UErrorCode &status);
1546 
1547 
1571  virtual int32_t split(const UnicodeString &input,
1572  UnicodeString dest[],
1573  int32_t destCapacity,
1574  UErrorCode &status);
1575 
1576 
1600  virtual int32_t split(UText *input,
1601  UText *dest[],
1602  int32_t destCapacity,
1603  UErrorCode &status);
1604 
1626  virtual void setTimeLimit(int32_t limit, UErrorCode &status);
1627 
1634  virtual int32_t getTimeLimit() const;
1635 
1657  virtual void setStackLimit(int32_t limit, UErrorCode &status);
1658 
1666  virtual int32_t getStackLimit() const;
1667 
1668 
1682  virtual void setMatchCallback(URegexMatchCallback *callback,
1683  const void *context,
1684  UErrorCode &status);
1685 
1686 
1697  virtual void getMatchCallback(URegexMatchCallback *&callback,
1698  const void *&context,
1699  UErrorCode &status);
1700 
1701 
1715  virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
1716  const void *context,
1717  UErrorCode &status);
1718 
1719 
1730  virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
1731  const void *&context,
1732  UErrorCode &status);
1733 
1734 #ifndef U_HIDE_INTERNAL_API
1735 
1740  void setTrace(UBool state);
1741 #endif /* U_HIDE_INTERNAL_API */
1742 
1748  static UClassID U_EXPORT2 getStaticClassID();
1749 
1755  virtual UClassID getDynamicClassID() const;
1756 
1757 private:
1758  // Constructors and other object boilerplate are private.
1759  // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
1760  RegexMatcher(); // default constructor not implemented
1761  RegexMatcher(const RegexPattern *pat);
1762  RegexMatcher(const RegexMatcher &other);
1763  RegexMatcher &operator =(const RegexMatcher &rhs);
1764  void init(UErrorCode &status); // Common initialization
1765  void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
1766 
1767  friend class RegexPattern;
1768  friend class RegexCImpl;
1769 public:
1770 #ifndef U_HIDE_INTERNAL_API
1771 
1772  void resetPreserveRegion(); // Reset matcher state, but preserve any region.
1773 #endif /* U_HIDE_INTERNAL_API */
1774 private:
1775 
1776  //
1777  // MatchAt This is the internal interface to the match engine itself.
1778  // Match status comes back in matcher member variables.
1779  //
1780  void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
1781  inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
1782  UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
1783  UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
1784  REStackFrame *resetStack();
1785  inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
1786  void IncrementTime(UErrorCode &status);
1787 
1788  // Call user find callback function, if set. Return TRUE if operation should be interrupted.
1789  inline UBool findProgressInterrupt(int64_t matchIndex, UErrorCode &status);
1790 
1791  int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
1792 
1793  UBool findUsingChunk(UErrorCode &status);
1794  void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1795  UBool isChunkWordBoundary(int32_t pos);
1796 
1797  const RegexPattern *fPattern;
1798  RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
1799  // should delete it when through.
1800 
1801  const UnicodeString *fInput; // The string being matched. Only used for input()
1802  UText *fInputText; // The text being matched. Is never NULL.
1803  UText *fAltInputText; // A shallow copy of the text being matched.
1804  // Only created if the pattern contains backreferences.
1805  int64_t fInputLength; // Full length of the input text.
1806  int32_t fFrameSize; // The size of a frame in the backtrack stack.
1807 
1808  int64_t fRegionStart; // Start of the input region, default = 0.
1809  int64_t fRegionLimit; // End of input region, default to input.length.
1810 
1811  int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
1812  int64_t fAnchorLimit; // See useAnchoringBounds
1813 
1814  int64_t fLookStart; // Region bounds for look-ahead/behind and
1815  int64_t fLookLimit; // other boundary tests. See
1816  // useTransparentBounds
1817 
1818  int64_t fActiveStart; // Currently active bounds for matching.
1819  int64_t fActiveLimit; // Usually is the same as region, but
1820  // is changed to fLookStart/Limit when
1821  // entering look around regions.
1822 
1823  UBool fTransparentBounds; // True if using transparent bounds.
1824  UBool fAnchoringBounds; // True if using anchoring bounds.
1825 
1826  UBool fMatch; // True if the last attempted match was successful.
1827  int64_t fMatchStart; // Position of the start of the most recent match
1828  int64_t fMatchEnd; // First position after the end of the most recent match
1829  // Zero if no previous match, even when a region
1830  // is active.
1831  int64_t fLastMatchEnd; // First position after the end of the previous match,
1832  // or -1 if there was no previous match.
1833  int64_t fAppendPosition; // First position after the end of the previous
1834  // appendReplacement(). As described by the
1835  // JavaDoc for Java Matcher, where it is called
1836  // "append position"
1837  UBool fHitEnd; // True if the last match touched the end of input.
1838  UBool fRequireEnd; // True if the last match required end-of-input
1839  // (matched $ or Z)
1840 
1841  UVector64 *fStack;
1842  REStackFrame *fFrame; // After finding a match, the last active stack frame,
1843  // which will contain the capture group results.
1844  // NOT valid while match engine is running.
1845 
1846  int64_t *fData; // Data area for use by the compiled pattern.
1847  int64_t fSmallData[8]; // Use this for data if it's enough.
1848 
1849  int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
1850  // match engine run. Zero for unlimited.
1851 
1852  int32_t fTime; // Match time, accumulates while matching.
1853  int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
1854  // Kept separately from fTime to keep as much
1855  // code as possible out of the inline
1856  // StateSave function.
1857 
1858  int32_t fStackLimit; // Maximum memory size to use for the backtrack
1859  // stack, in bytes. Zero for unlimited.
1860 
1861  URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct.
1862  // NULL if there is no callback.
1863  const void *fCallbackContext; // User Context ptr for callback function.
1864 
1865  URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback function.
1866  // NULL if there is no callback.
1867  const void *fFindProgressCallbackContext; // User Context ptr for callback function.
1868 
1869 
1870  UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
1871 
1872  UBool fTraceDebug; // Set true for debug tracing of match engine.
1873 
1874  UErrorCode fDeferredStatus; // Save error state that cannot be immediately
1875  // reported, or that permanently disables this matcher.
1876 
1877  RuleBasedBreakIterator *fWordBreakItr;
1878 };
1879 
1880 U_NAMESPACE_END
1881 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
1882 
1883 #endif /* U_SHOW_CPLUSPLUS_API */
1884 
1885 #endif
struct UHashtable UHashtable
Definition: msgfmt.h:43
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:96
Class RegexPattern represents a compiled regular expression.
Definition: regex.h:87
UBool URegexFindProgressCallback(const void *context, int64_t matchIndex)
Function pointer for a regular expression find callback function.
Definition: uregex.h:1573
C API: Abstract Unicode Text API.
class RegexMatcher bundles together a regular expression pattern and input text to which the expression can be applied.
Definition: regex.h:665
UBool operator!=(const RegexPattern &that) const
Comparison operator.
Definition: regex.h:132
#define U_I18N_API
Set to export library symbols from inside the i18n library, and to import them from outside...
Definition: utypes.h:301
C API: Regular Expressions.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:425
virtual UClassID getDynamicClassID() const
ICU4C "poor man's RTTI", returns a UClassID for the actual ICU class.
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:281
C++ API: Common ICU base class UObject.
UBool URegexMatchCallback(const void *context, int32_t steps)
Function pointer for a regular expression matching callback function.
Definition: uregex.h:1499
C API: Parse Error Information.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
#define U_FINAL
Defined to the C++11 "final" keyword if available.
Definition: umachine.h:140
UText struct.
Definition: utext.h:1347
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:55
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:58
Basic definitions for ICU, for both C and C++ APIs.
UnicodeString is a string class that stores Unicode characters directly and provides similar functionality as the Java String and StringBuffer/StringBuilder classes.
Definition: unistr.h:294
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
int8_t UBool
The ICU boolean type.
Definition: umachine.h:261