ICU 65.1  65.1
normalizer2.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: normalizer2.h
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
17 */
18 
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
21 
27 #include "unicode/utypes.h"
28 
29 #if U_SHOW_CPLUSPLUS_API
30 
31 #if !UCONFIG_NO_NORMALIZATION
32 
33 #include "unicode/stringpiece.h"
34 #include "unicode/uniset.h"
35 #include "unicode/unistr.h"
36 #include "unicode/unorm2.h"
37 
38 U_NAMESPACE_BEGIN
39 
40 class ByteSink;
41 
85 class U_COMMON_API Normalizer2 : public UObject {
86 public:
91  ~Normalizer2();
92 
104  static const Normalizer2 *
105  getNFCInstance(UErrorCode &errorCode);
106 
118  static const Normalizer2 *
119  getNFDInstance(UErrorCode &errorCode);
120 
132  static const Normalizer2 *
133  getNFKCInstance(UErrorCode &errorCode);
134 
146  static const Normalizer2 *
147  getNFKDInstance(UErrorCode &errorCode);
148 
160  static const Normalizer2 *
161  getNFKCCasefoldInstance(UErrorCode &errorCode);
162 
184  static const Normalizer2 *
185  getInstance(const char *packageName,
186  const char *name,
187  UNormalization2Mode mode,
188  UErrorCode &errorCode);
189 
200  UnicodeString
201  normalize(const UnicodeString &src, UErrorCode &errorCode) const {
202  UnicodeString result;
203  normalize(src, result, errorCode);
204  return result;
205  }
219  virtual UnicodeString &
220  normalize(const UnicodeString &src,
221  UnicodeString &dest,
222  UErrorCode &errorCode) const = 0;
223 
248  virtual void
249  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
250  Edits *edits, UErrorCode &errorCode) const;
251 
266  virtual UnicodeString &
267  normalizeSecondAndAppend(UnicodeString &first,
268  const UnicodeString &second,
269  UErrorCode &errorCode) const = 0;
284  virtual UnicodeString &
285  append(UnicodeString &first,
286  const UnicodeString &second,
287  UErrorCode &errorCode) const = 0;
288 
302  virtual UBool
303  getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
304 
329  virtual UBool
330  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
331 
347  virtual UChar32
348  composePair(UChar32 a, UChar32 b) const;
349 
358  virtual uint8_t
359  getCombiningClass(UChar32 c) const;
360 
375  virtual UBool
376  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
398  virtual UBool
399  isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
400 
401 
417  virtual UNormalizationCheckResult
418  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
419 
442  virtual int32_t
443  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
444 
458  virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
459 
474  virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
475 
489  virtual UBool isInert(UChar32 c) const = 0;
490 };
491 
503 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
504 public:
515  FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
516  norm2(n2), set(filterSet) {}
517 
522  ~FilteredNormalizer2();
523 
537  virtual UnicodeString &
538  normalize(const UnicodeString &src,
539  UnicodeString &dest,
540  UErrorCode &errorCode) const U_OVERRIDE;
541 
566  virtual void
567  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
568  Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
569 
584  virtual UnicodeString &
585  normalizeSecondAndAppend(UnicodeString &first,
586  const UnicodeString &second,
587  UErrorCode &errorCode) const U_OVERRIDE;
602  virtual UnicodeString &
603  append(UnicodeString &first,
604  const UnicodeString &second,
605  UErrorCode &errorCode) const U_OVERRIDE;
606 
618  virtual UBool
619  getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
620 
632  virtual UBool
633  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
634 
645  virtual UChar32
646  composePair(UChar32 a, UChar32 b) const U_OVERRIDE;
647 
656  virtual uint8_t
657  getCombiningClass(UChar32 c) const U_OVERRIDE;
658 
670  virtual UBool
671  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
693  virtual UBool
694  isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;
706  virtual UNormalizationCheckResult
707  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
719  virtual int32_t
720  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
721 
730  virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;
731 
740  virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;
741 
749  virtual UBool isInert(UChar32 c) const U_OVERRIDE;
750 private:
751  UnicodeString &
752  normalize(const UnicodeString &src,
753  UnicodeString &dest,
754  USetSpanCondition spanCondition,
755  UErrorCode &errorCode) const;
756 
757  void
758  normalizeUTF8(uint32_t options, const char *src, int32_t length,
759  ByteSink &sink, Edits *edits,
760  USetSpanCondition spanCondition,
761  UErrorCode &errorCode) const;
762 
763  UnicodeString &
764  normalizeSecondAndAppend(UnicodeString &first,
765  const UnicodeString &second,
766  UBool doNormalize,
767  UErrorCode &errorCode) const;
768 
769  const Normalizer2 &norm2;
770  const UnicodeSet &set;
771 };
772 
773 U_NAMESPACE_END
774 
775 #endif // !UCONFIG_NO_NORMALIZATION
776 
777 #endif /* U_SHOW_CPLUSPLUS_API */
778 
779 #endif // __NORMALIZER2_H__
virtual UBool hasBoundaryBefore(UChar32 c) const =0
Tests if the character always has a normalization boundary before it, regardless of context...
#define U_OVERRIDE
Defined to the C++11 "override" keyword if available.
Definition: umachine.h:129
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const =0
Returns the end of the normalized substring of the input string.
C++ API: Unicode String.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const
Gets the raw decomposition mapping of c.
A ByteSink can be filled with bytes.
Definition: bytestream.h:53
virtual uint8_t getCombiningClass(UChar32 c) const
Gets the combining class of c.
UnicodeString normalize(const UnicodeString &src, UErrorCode &errorCode) const
Returns the normalized form of the source string.
Definition: normalizer2.h:201
Records lengths of string edits but not replacement text.
Definition: edits.h:80
C++ API: StringPiece: Read-only byte string wrapper class.
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the second string to the first string (merging them at the boundary) and returns the first st...
Unicode normalization functionality for standard Unicode normalization or for using custom mapping ta...
Definition: normalizer2.h:85
C API: New API for Unicode Normalization.
virtual UBool hasBoundaryAfter(UChar32 c) const =0
Tests if the character always has a normalization boundary after it, regardless of context...
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:425
virtual UChar32 composePair(UChar32 a, UChar32 b) const
Performs pairwise composition of a & b and returns the composite if there is one. ...
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet)
Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
Definition: normalizer2.h:515
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:281
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition: uset.h:156
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const
Tests if the UTF-8 string is normalized.
UNormalization2Mode
Constants for normalization modes.
Definition: unorm2.h:45
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the normalized form of the second string to the first string (merging them at the boundary) a...
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const =0
Gets the decomposition mapping of c.
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchange...
Basic definitions for ICU, for both C and C++ APIs.
virtual UBool isInert(UChar32 c) const =0
Tests if the character is normalization-inert.
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:300
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:294
A string-like object that points to a sized piece of memory.
Definition: stringpiece.h:60
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
Normalization filtered by a UnicodeSet.
Definition: normalizer2.h:503
UNormalizationCheckResult
Result values for normalization quick check functions.
Definition: unorm2.h:94
int8_t UBool
The ICU boolean type.
Definition: umachine.h:261
C++ API: Unicode Set.