ICU 57.1  57.1
uniset.h
Go to the documentation of this file.
1 /*
2 ***************************************************************************
3 * Copyright (C) 1999-2016, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 ***************************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 ***************************************************************************
9 */
10 
11 #ifndef UNICODESET_H
12 #define UNICODESET_H
13 
14 #include "unicode/unifilt.h"
15 #include "unicode/unistr.h"
16 #include "unicode/uset.h"
17 
24 
25 // Forward Declarations.
26 void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status);
28 class BMPSet;
29 class ParsePosition;
30 class RBBIRuleScanner;
31 class SymbolTable;
32 class UnicodeSetStringSpan;
33 class UVector;
34 class RuleCharacterIterator;
35 
276 class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter {
277 
278  int32_t len; // length of list used; 0 <= len <= capacity
279  int32_t capacity; // capacity of list
280  UChar32* list; // MUST be terminated with HIGH
281  BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
282  UChar32* buffer; // internal buffer, may be NULL
283  int32_t bufferCapacity; // capacity of buffer
284  int32_t patLen;
285 
295  UChar *pat;
296  UVector* strings; // maintained in sorted order
297  UnicodeSetStringSpan *stringSpan;
298 
299 private:
300  enum { // constants
301  kIsBogus = 1 // This set is bogus (i.e. not valid)
302  };
303  uint8_t fFlags; // Bit flag (see constants above)
304 public:
314  inline UBool isBogus(void) const;
315 
332  void setToBogus();
333 
334 public:
335 
336  enum {
341  MIN_VALUE = 0,
342 
347  MAX_VALUE = 0x10ffff
348  };
349 
350  //----------------------------------------------------------------
351  // Constructors &c
352  //----------------------------------------------------------------
353 
354 public:
355 
360  UnicodeSet();
361 
370  UnicodeSet(UChar32 start, UChar32 end);
371 
372 #ifndef U_HIDE_INTERNAL_API
373 
377  kSerialized /* result of serialize() */
378  };
379 
390  UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
391  ESerialization serialization, UErrorCode &status);
392 #endif /* U_HIDE_INTERNAL_API */
393 
402  UnicodeSet(const UnicodeString& pattern,
403  UErrorCode& status);
404 
405 #ifndef U_HIDE_INTERNAL_API
406 
418  UnicodeSet(const UnicodeString& pattern,
419  uint32_t options,
420  const SymbolTable* symbols,
421  UErrorCode& status);
422 #endif /* U_HIDE_INTERNAL_API */
423 
437  UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
438  uint32_t options,
439  const SymbolTable* symbols,
440  UErrorCode& status);
441 
446  UnicodeSet(const UnicodeSet& o);
447 
452  virtual ~UnicodeSet();
453 
459  UnicodeSet& operator=(const UnicodeSet& o);
460 
472  virtual UBool operator==(const UnicodeSet& o) const;
473 
479  UBool operator!=(const UnicodeSet& o) const;
480 
490  virtual UnicodeFunctor* clone() const;
491 
499  virtual int32_t hashCode(void) const;
500 
509  inline static UnicodeSet *fromUSet(USet *uset);
510 
519  inline static const UnicodeSet *fromUSet(const USet *uset);
520 
528  inline USet *toUSet();
529 
530 
538  inline const USet * toUSet() const;
539 
540 
541  //----------------------------------------------------------------
542  // Freezable API
543  //----------------------------------------------------------------
544 
553  inline UBool isFrozen() const;
554 
568  UnicodeFunctor *freeze();
569 
578  UnicodeFunctor *cloneAsThawed() const;
579 
580  //----------------------------------------------------------------
581  // Public API
582  //----------------------------------------------------------------
583 
594  UnicodeSet& set(UChar32 start, UChar32 end);
595 
601  static UBool resemblesPattern(const UnicodeString& pattern,
602  int32_t pos);
603 
616  UnicodeSet& applyPattern(const UnicodeString& pattern,
617  UErrorCode& status);
618 
619 #ifndef U_HIDE_INTERNAL_API
620 
636  UnicodeSet& applyPattern(const UnicodeString& pattern,
637  uint32_t options,
638  const SymbolTable* symbols,
639  UErrorCode& status);
640 #endif /* U_HIDE_INTERNAL_API */
641 
673  UnicodeSet& applyPattern(const UnicodeString& pattern,
674  ParsePosition& pos,
675  uint32_t options,
676  const SymbolTable* symbols,
677  UErrorCode& status);
678 
692  virtual UnicodeString& toPattern(UnicodeString& result,
693  UBool escapeUnprintable = FALSE) const;
694 
717  UnicodeSet& applyIntPropertyValue(UProperty prop,
718  int32_t value,
719  UErrorCode& ec);
720 
750  UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
751  const UnicodeString& value,
752  UErrorCode& ec);
753 
762  virtual int32_t size(void) const;
763 
770  virtual UBool isEmpty(void) const;
771 
779  virtual UBool contains(UChar32 c) const;
780 
789  virtual UBool contains(UChar32 start, UChar32 end) const;
790 
798  UBool contains(const UnicodeString& s) const;
799 
807  virtual UBool containsAll(const UnicodeSet& c) const;
808 
816  UBool containsAll(const UnicodeString& s) const;
817 
826  UBool containsNone(UChar32 start, UChar32 end) const;
827 
835  UBool containsNone(const UnicodeSet& c) const;
836 
844  UBool containsNone(const UnicodeString& s) const;
845 
854  inline UBool containsSome(UChar32 start, UChar32 end) const;
855 
863  inline UBool containsSome(const UnicodeSet& s) const;
864 
872  inline UBool containsSome(const UnicodeString& s) const;
873 
892  int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
893 
906  inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
907 
925  int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
926 
940  inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
941 
960  int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
961 
979  int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
980 
985  virtual UMatchDegree matches(const Replaceable& text,
986  int32_t& offset,
987  int32_t limit,
988  UBool incremental);
989 
990 private:
1013  static int32_t matchRest(const Replaceable& text,
1014  int32_t start, int32_t limit,
1015  const UnicodeString& s);
1016 
1026  int32_t findCodePoint(UChar32 c) const;
1027 
1028 public:
1029 
1037  virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
1038 
1047  int32_t indexOf(UChar32 c) const;
1048 
1058  UChar32 charAt(int32_t index) const;
1059 
1074  virtual UnicodeSet& add(UChar32 start, UChar32 end);
1075 
1083  UnicodeSet& add(UChar32 c);
1084 
1096  UnicodeSet& add(const UnicodeString& s);
1097 
1098  private:
1104  static int32_t getSingleCP(const UnicodeString& s);
1105 
1106  void _add(const UnicodeString& s);
1107 
1108  public:
1117  UnicodeSet& addAll(const UnicodeString& s);
1118 
1127  UnicodeSet& retainAll(const UnicodeString& s);
1128 
1137  UnicodeSet& complementAll(const UnicodeString& s);
1138 
1147  UnicodeSet& removeAll(const UnicodeString& s);
1148 
1157  static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1158 
1159 
1167  static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1168 
1182  virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1183 
1184 
1190  UnicodeSet& retain(UChar32 c);
1191 
1205  virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1206 
1214  UnicodeSet& remove(UChar32 c);
1215 
1225  UnicodeSet& remove(const UnicodeString& s);
1226 
1234  virtual UnicodeSet& complement(void);
1235 
1250  virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1251 
1259  UnicodeSet& complement(UChar32 c);
1260 
1271  UnicodeSet& complement(const UnicodeString& s);
1272 
1285  virtual UnicodeSet& addAll(const UnicodeSet& c);
1286 
1298  virtual UnicodeSet& retainAll(const UnicodeSet& c);
1299 
1311  virtual UnicodeSet& removeAll(const UnicodeSet& c);
1312 
1323  virtual UnicodeSet& complementAll(const UnicodeSet& c);
1324 
1331  virtual UnicodeSet& clear(void);
1332 
1358  UnicodeSet& closeOver(int32_t attribute);
1359 
1366  virtual UnicodeSet &removeAllStrings();
1367 
1375  virtual int32_t getRangeCount(void) const;
1376 
1384  virtual UChar32 getRangeStart(int32_t index) const;
1385 
1393  virtual UChar32 getRangeEnd(int32_t index) const;
1394 
1443  int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1444 
1451  virtual UnicodeSet& compact();
1452 
1464  static UClassID U_EXPORT2 getStaticClassID(void);
1465 
1474  virtual UClassID getDynamicClassID(void) const;
1475 
1476 private:
1477 
1478  // Private API for the USet API
1479 
1480  friend class USetAccess;
1481 
1482  int32_t getStringCount() const;
1483 
1484  const UnicodeString* getString(int32_t index) const;
1485 
1486  //----------------------------------------------------------------
1487  // RuleBasedTransliterator support
1488  //----------------------------------------------------------------
1489 
1490 private:
1491 
1497  virtual UBool matchesIndexValue(uint8_t v) const;
1498 
1499 private:
1500  friend class RBBIRuleScanner;
1501 
1502  //----------------------------------------------------------------
1503  // Implementation: Clone as thawed (see ICU4J Freezable)
1504  //----------------------------------------------------------------
1505 
1506  UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1507 
1508  //----------------------------------------------------------------
1509  // Implementation: Pattern parsing
1510  //----------------------------------------------------------------
1511 
1512  void applyPatternIgnoreSpace(const UnicodeString& pattern,
1513  ParsePosition& pos,
1514  const SymbolTable* symbols,
1515  UErrorCode& status);
1516 
1517  void applyPattern(RuleCharacterIterator& chars,
1518  const SymbolTable* symbols,
1519  UnicodeString& rebuiltPat,
1520  uint32_t options,
1521  UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
1522  UErrorCode& ec);
1523 
1524  //----------------------------------------------------------------
1525  // Implementation: Utility methods
1526  //----------------------------------------------------------------
1527 
1528  void ensureCapacity(int32_t newLen, UErrorCode& ec);
1529 
1530  void ensureBufferCapacity(int32_t newLen, UErrorCode& ec);
1531 
1532  void swapBuffers(void);
1533 
1534  UBool allocateStrings(UErrorCode &status);
1535 
1536  UnicodeString& _toPattern(UnicodeString& result,
1537  UBool escapeUnprintable) const;
1538 
1539  UnicodeString& _generatePattern(UnicodeString& result,
1540  UBool escapeUnprintable) const;
1541 
1542  static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1543 
1544  static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1545 
1546  //----------------------------------------------------------------
1547  // Implementation: Fundamental operators
1548  //----------------------------------------------------------------
1549 
1550  void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1551 
1552  void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1553 
1554  void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1555 
1561  static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1562  int32_t pos);
1563 
1564  static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1565  int32_t iterOpts);
1566 
1606  UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1607  ParsePosition& ppos,
1608  UErrorCode &ec);
1609 
1610  void applyPropertyPattern(RuleCharacterIterator& chars,
1611  UnicodeString& rebuiltPat,
1612  UErrorCode& ec);
1613 
1614  friend void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status);
1615  static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
1616 
1621  typedef UBool (*Filter)(UChar32 codePoint, void* context);
1622 
1632  void applyFilter(Filter filter,
1633  void* context,
1634  int32_t src,
1635  UErrorCode &status);
1636 
1640  void setPattern(const UnicodeString& newPat);
1644  void releasePattern();
1645 
1646  friend class UnicodeSetIterator;
1647 };
1648 
1649 
1650 
1651 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
1652  return !operator==(o);
1653 }
1654 
1655 inline UBool UnicodeSet::isFrozen() const {
1656  return (UBool)(bmpSet!=NULL || stringSpan!=NULL);
1657 }
1658 
1659 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1660  return !containsNone(start, end);
1661 }
1662 
1664  return !containsNone(s);
1665 }
1666 
1668  return !containsNone(s);
1669 }
1670 
1671 inline UBool UnicodeSet::isBogus() const {
1672  return (UBool)(fFlags & kIsBogus);
1673 }
1674 
1676  return reinterpret_cast<UnicodeSet *>(uset);
1677 }
1678 
1679 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
1680  return reinterpret_cast<const UnicodeSet *>(uset);
1681 }
1682 
1684  return reinterpret_cast<USet *>(this);
1685 }
1686 
1687 inline const USet *UnicodeSet::toUSet() const {
1688  return reinterpret_cast<const USet *>(this);
1689 }
1690 
1691 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1692  int32_t sLength=s.length();
1693  if(start<0) {
1694  start=0;
1695  } else if(start>sLength) {
1696  start=sLength;
1697  }
1698  return start+span(s.getBuffer()+start, sLength-start, spanCondition);
1699 }
1700 
1701 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1702  int32_t sLength=s.length();
1703  if(limit<0) {
1704  limit=0;
1705  } else if(limit>sLength) {
1706  limit=sLength;
1707  }
1708  return spanBack(s.getBuffer(), limit, spanCondition);
1709 }
1710 
1712 
1713 #endif
static UClassID getStaticClassID()
ICU "poor man's RTTI", returns a UClassID for this class.
UMatchDegree
Constants returned by UnicodeMatcher::matches() indicating the degree of match.
Definition: unimatch.h:30
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
UnicodeSetIterator iterates over the contents of a UnicodeSet.
Definition: usetiter.h:61
UBool isBogus(void) const
Determine if this object contains a valid set.
Definition: uniset.h:1671
#define U_CALLCONV
Similar to U_CDECL_BEGIN/U_CDECL_END, this qualifier is necessary in callback function typedefs to ma...
Definition: platform.h:849
UBool operator!=(const UnicodeSet &o) const
Compares the specified object with this set for equality.
Definition: uniset.h:1651
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:91
static UnicodeSet * fromUSet(USet *uset)
Get a UnicodeSet pointer from a USet.
Definition: uniset.h:1675
virtual UBool matchesIndexValue(uint8_t v) const =0
Returns TRUE if this matcher will match a character c, where c & 0xFF == v, at offset, in the forward direction (with limit > offset).
C API: Unicode Set.
An interface that defines both lookup protocol and parsing of symbolic names.
Definition: symtable.h:54
virtual UClassID getDynamicClassID(void) const =0
Returns a unique class ID polymorphically.
virtual UnicodeFunctor * clone() const =0
Return a copy of this object.
Replaceable is an abstract base class representing a string of characters that supports the replaceme...
Definition: rep.h:71
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:129
UnicodeFilter defines a protocol for selecting a subset of the full range (U+0000 to U+10FFFF) of Uni...
Definition: unifilt.h:59
UChar * getBuffer(int32_t minCapacity)
Get a read/write pointer to the internal buffer.
int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const
Returns the length of the initial substring of the input string which consists only of characters and...
virtual void addMatchSetTo(UnicodeSet &toUnionTo) const =0
Union the set of all characters that may be matched by this object into the given set...
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:218
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:332
#define NULL
Define NULL if necessary, to 0 for C++ and to ((void *)0) for C.
Definition: utypes.h:186
UnicodeFunctor is an abstract base class for objects that perform match and/or replace operations on ...
Definition: unifunct.h:33
virtual UMatchDegree matches(const Replaceable &text, int32_t &offset, int32_t limit, UBool incremental)
Implement UnicodeMatcher API.
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:276
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition: uset.h:150
uint16_t UChar
Define UChar to be UCHAR_TYPE, if that is #defined (for example, to char16_t), or wchar_t if that is ...
Definition: umachine.h:312
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:130
struct USet USet
Definition: ucnv.h:67
UProperty
Selection constants for Unicode properties.
Definition: uchar.h:161
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:476
int32_t length(void) const
Return the length of the UnicodeString object.
Definition: unistr.h:3794
ParsePosition is a simple class used by Format and its subclasses to keep track of the current positi...
Definition: parsepos.h:47
void UnicodeSet_initInclusion(int32_t src, UErrorCode &status)
virtual UnicodeString & toPattern(UnicodeString &result, UBool escapeUnprintable=FALSE) const =0
Returns a string representation of this matcher.
UBool containsSome(UChar32 start, UChar32 end) const
Returns true if this set contains one or more of the characters in the given range.
Definition: uniset.h:1659
virtual UBool contains(UChar32 c) const =0
Returns true for characters that are in the selected subset.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:242
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:357
int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const
Returns the start of the trailing substring of the input string which consists only of characters and...
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:293
UBool isFrozen() const
Determines whether the set has been frozen (made immutable) or not.
Definition: uniset.h:1655
USet * toUSet()
Produce a USet * pointer for this UnicodeSet.
Definition: uniset.h:1683
C++ API: Unicode Filter.
int8_t UBool
The ICU boolean type.
Definition: umachine.h:234