Lucene++ - a full-featured, c++ search engine
API Documentation


UTF8Stream.h
Go to the documentation of this file.
1 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
6 
7 #ifndef UTF8STREAM_H
8 #define UTF8STREAM_H
9 
10 #include "LuceneObject.h"
11 
12 namespace Lucene {
13 
// Abstract base shared by the converters declared in this header (UTF8Encoder,
// UTF8Decoder and UTF16Decoder derive from it directly). Only declarations are
// visible here: the constant values and helper bodies are defined elsewhere.
14 class LPPAPI UTF8Base : public LuceneObject {
15 public:
16  virtual ~UTF8Base();
    // NOTE(review): listing line 17 was dropped from this extract; it is reconstructed
    // from the numbering gap and the LUCENE_CLASS entry in the cross-reference index.
17  LUCENE_CLASS(UTF8Base);
18 
19 public:
    // Surrogate-range bounds and offsets. NOTE(review): presumably the standard
    // UTF-16 surrogate constants - confirm the values in the implementation file.
20  static const uint16_t LEAD_SURROGATE_MIN;
21  static const uint16_t LEAD_SURROGATE_MAX;
22  static const uint16_t TRAIL_SURROGATE_MIN;
23  static const uint16_t TRAIL_SURROGATE_MAX;
24  static const uint16_t LEAD_OFFSET;
25  static const uint32_t SURROGATE_OFFSET;
26  static const uint32_t CODE_POINT_MAX;
27 
    // Special wchar_t values; how they are used (e.g. substitution for invalid
    // input, end-of-input marker) is not visible here - TODO confirm.
28  static const wchar_t UNICODE_REPLACEMENT_CHAR;
29  static const wchar_t UNICODE_TERMINATOR;
30 
31 protected:
    // Pure virtual input hook: every concrete subclass in this header overrides it.
    // The return convention at end of input is not shown in this header.
32  virtual uint32_t readNext() = 0;
33 
    // Masking and classification helpers available to subclasses (bodies not shown).
34  uint8_t mask8(uint32_t b);
35  uint16_t mask16(uint32_t c);
36  bool isTrail(uint32_t b);
37  bool isSurrogate(uint32_t cp);
38  bool isLeadSurrogate(uint32_t cp);
39  bool isTrailSurrogate(uint32_t cp);
40  bool isValidCodePoint(uint32_t cp);
41  bool isOverlongSequence(uint32_t cp, int32_t length);
42 };
43 
// Encoder over a caller-supplied wchar_t range [unicodeBegin, unicodeEnd); the
// encode methods write into a caller-supplied uint8_t buffer of the given length.
// NOTE(review): the int32_t results are presumably the number of bytes written and
// the range is presumably half-open - confirm against the implementation.
44 class UTF8Encoder : public UTF8Base {
45 public:
46  UTF8Encoder(const wchar_t* unicodeBegin, const wchar_t* unicodeEnd);
47  virtual ~UTF8Encoder();
48 
    // NOTE(review): listing line 49 was dropped from this extract; it is reconstructed
    // from the numbering gap and the LUCENE_CLASS entry in the cross-reference index.
49  LUCENE_CLASS(UTF8Encoder);
50 
51 protected:
    // Input range given to the constructor; stored as raw pointers.
52  const wchar_t* unicodeBegin;
53  const wchar_t* unicodeEnd;
54 
55 public:
    // General entry point. NOTE(review): presumably dispatches to one of the two
    // width-specific routines below depending on sizeof(wchar_t) - TODO confirm.
56  int32_t encode(uint8_t* utf8, int32_t length);
57 
58  int32_t utf16to8(uint8_t* utf8, int32_t length);
59  int32_t utf32to8(uint8_t* utf8, int32_t length);
60 
61 protected:
    // Overrides UTF8Base::readNext() for this input source.
62  virtual uint32_t readNext();
63 
    // Takes an output position and a code point and returns a uint8_t* (presumably
    // the position after the appended bytes - confirm).
64  uint8_t* appendChar(uint8_t* utf8, uint32_t cp);
65 };
66 
// Encoder variant constructed from a ReaderPtr instead of a pointer range.
// NOTE(review): listing lines 67, 69, 72 and 75 were dropped from this extract. The
// constructor, LUCENE_CLASS and `reader` lines are reconstructed from the
// cross-reference index; the base class is not shown anywhere in this extract and
// UTF8Encoder is assumed - confirm against the original header.
67 class UTF8EncoderStream : public UTF8Encoder {
68 public:
69  UTF8EncoderStream(const ReaderPtr& reader);
70  virtual ~UTF8EncoderStream();
71 
72  LUCENE_CLASS(UTF8EncoderStream);
73 
74 protected:
    // Reader passed to the constructor (a boost::shared_ptr<Reader> per the index).
75  ReaderPtr reader;
76 
77 protected:
    // Overrides readNext(); presumably pulls input from `reader` - TODO confirm.
78  virtual uint32_t readNext();
79 };
80 
// Decoder over a caller-supplied byte range [utf8Begin, utf8End); the decode
// methods write into a caller-supplied wchar_t buffer of the given length.
// NOTE(review): the int32_t results are presumably the number of wchar_t units
// written and the range is presumably half-open - confirm against the implementation.
81 class UTF8Decoder : public UTF8Base {
82 public:
83  UTF8Decoder(const uint8_t* utf8Begin, const uint8_t* utf8End);
84  virtual ~UTF8Decoder();
85 
    // NOTE(review): listing line 86 was dropped from this extract; it is reconstructed
    // from the numbering gap and the LUCENE_CLASS entry in the cross-reference index.
86  LUCENE_CLASS(UTF8Decoder);
87 
88 protected:
    // Input range given to the constructor; stored as raw pointers.
89  const uint8_t* utf8Begin;
90  const uint8_t* utf8End;
91 
92 public:
    // General entry point. NOTE(review): presumably dispatches to one of the two
    // width-specific routines below depending on sizeof(wchar_t) - TODO confirm.
93  int32_t decode(wchar_t* unicode, int32_t length);
94 
95  int32_t utf8to16(wchar_t* unicode, int32_t length);
96  int32_t utf8to32(wchar_t* unicode, int32_t length);
97 
98 protected:
    // Overrides UTF8Base::readNext() for this input source.
99  virtual uint32_t readNext();
100 
    // Sequence-parsing helpers; getSequence and isValidNext take `cp` by non-const
    // reference, so they can modify the caller's value. Bodies are not shown here.
101  int32_t sequenceLength(uint32_t cp);
102  bool getSequence(uint32_t& cp, int32_t length);
103  bool isValidNext(uint32_t& cp);
104 };
105 
// Decoder variant constructed from a ReaderPtr instead of a pointer range.
// NOTE(review): listing lines 106, 108, 109, 111 and 114 were dropped from this
// extract. The constructor, destructor, LUCENE_CLASS and `reader` lines are
// reconstructed from the cross-reference index; the base class is not shown anywhere
// in this extract and UTF8Decoder is assumed - confirm against the original header.
106 class UTF8DecoderStream : public UTF8Decoder {
107 public:
108  UTF8DecoderStream(const ReaderPtr& reader);
109  virtual ~UTF8DecoderStream();
110 
111  LUCENE_CLASS(UTF8DecoderStream);
112 
113 protected:
    // Reader passed to the constructor (a boost::shared_ptr<Reader> per the index).
114  ReaderPtr reader;
115 
116 protected:
    // Overrides readNext(); presumably pulls input from `reader` - TODO confirm.
117  virtual uint32_t readNext();
118 };
119 
// Decoder over a caller-supplied range of uint16_t units [utf16Begin, utf16End);
// the decode methods write into a caller-supplied wchar_t buffer of the given length.
// NOTE(review): the int32_t results are presumably the number of wchar_t units
// written and the range is presumably half-open - confirm against the implementation.
120 class UTF16Decoder : public UTF8Base {
121 public:
122  UTF16Decoder(const uint16_t* utf16Begin, const uint16_t* utf16End);
123  virtual ~UTF16Decoder();
124 
    // NOTE(review): listing line 125 was dropped from this extract; it is reconstructed
    // from the numbering gap and the LUCENE_CLASS entry in the cross-reference index.
125  LUCENE_CLASS(UTF16Decoder);
126 
127 protected:
    // Input range given to the constructor; stored as raw pointers.
128  const uint16_t* utf16Begin;
129  const uint16_t* utf16End;
130 
131 public:
    // General entry point. NOTE(review): presumably dispatches to one of the two
    // width-specific routines below depending on sizeof(wchar_t) - TODO confirm.
132  int32_t decode(wchar_t* unicode, int32_t length);
133 
134  int32_t utf16to16(wchar_t* unicode, int32_t length);
135  int32_t utf16to32(wchar_t* unicode, int32_t length);
136 
137 protected:
    // Overrides UTF8Base::readNext() for this input source.
138  virtual uint32_t readNext();
139 };
140 
141 }
142 
143 #endif
Lucene::UTF8Base::TRAIL_SURROGATE_MIN
static const uint16_t TRAIL_SURROGATE_MIN
Definition: UTF8Stream.h:22
Lucene::UTF8EncoderStream::reader
ReaderPtr reader
Definition: UTF8Stream.h:75
Lucene::UTF16Decoder::utf16to16
int32_t utf16to16(wchar_t *unicode, int32_t length)
Lucene::UTF8Decoder::utf8Begin
const uint8_t * utf8Begin
Definition: UTF8Stream.h:89
LUCENE_CLASS
#define LUCENE_CLASS(Name)
Definition: LuceneObject.h:24
Lucene::UTF8DecoderStream
Definition: UTF8Stream.h:106
Lucene::UTF8Base::SURROGATE_OFFSET
static const uint32_t SURROGATE_OFFSET
Definition: UTF8Stream.h:25
Lucene::UTF8Base::isOverlongSequence
bool isOverlongSequence(uint32_t cp, int32_t length)
Lucene::UTF8Encoder::unicodeBegin
const wchar_t * unicodeBegin
Definition: UTF8Stream.h:52
Lucene::UTF8EncoderStream::readNext
virtual uint32_t readNext()
Lucene::UTF8Decoder::~UTF8Decoder
virtual ~UTF8Decoder()
Lucene::UTF8Decoder::sequenceLength
int32_t sequenceLength(uint32_t cp)
Lucene::ReaderPtr
boost::shared_ptr< Reader > ReaderPtr
Definition: LuceneTypes.h:547
Lucene::UTF8Base::CODE_POINT_MAX
static const uint32_t CODE_POINT_MAX
Definition: UTF8Stream.h:26
Lucene::UTF8Decoder::utf8to32
int32_t utf8to32(wchar_t *unicode, int32_t length)
Lucene::UTF8DecoderStream::reader
ReaderPtr reader
Definition: UTF8Stream.h:114
Lucene::UTF8Encoder::appendChar
uint8_t * appendChar(uint8_t *utf8, uint32_t cp)
Lucene::UTF8Encoder::utf16to8
int32_t utf16to8(uint8_t *utf8, int32_t length)
Lucene::UTF16Decoder::readNext
virtual uint32_t readNext()
Lucene
Definition: AbstractAllTermDocs.h:12
Lucene::UTF8Decoder::isValidNext
bool isValidNext(uint32_t &cp)
Lucene::UTF8Base::isSurrogate
bool isSurrogate(uint32_t cp)
Lucene::LuceneObject
Base class for all Lucene classes.
Definition: LuceneObject.h:31
Lucene::UTF8Encoder::readNext
virtual uint32_t readNext()
Lucene::UTF8Base::isTrail
bool isTrail(uint32_t b)
Lucene::UTF16Decoder::utf16End
const uint16_t * utf16End
Definition: UTF8Stream.h:129
Lucene::UTF8Base::isLeadSurrogate
bool isLeadSurrogate(uint32_t cp)
Lucene::UTF8Base::UNICODE_REPLACEMENT_CHAR
static const wchar_t UNICODE_REPLACEMENT_CHAR
Definition: UTF8Stream.h:28
Lucene::UTF8Base::readNext
virtual uint32_t readNext()=0
Lucene::UTF8Encoder::~UTF8Encoder
virtual ~UTF8Encoder()
Lucene::UTF8Decoder::getSequence
bool getSequence(uint32_t &cp, int32_t length)
Lucene::UTF8Base::mask8
uint8_t mask8(uint32_t b)
Lucene::UTF8Base::~UTF8Base
virtual ~UTF8Base()
Lucene::UTF8Base::UNICODE_TERMINATOR
static const wchar_t UNICODE_TERMINATOR
Definition: UTF8Stream.h:29
Lucene::UTF8Base::LEAD_SURROGATE_MIN
static const uint16_t LEAD_SURROGATE_MIN
Definition: UTF8Stream.h:20
Lucene::UTF16Decoder::UTF16Decoder
UTF16Decoder(const uint16_t *utf16Begin, const uint16_t *utf16End)
Lucene::UTF8Encoder::utf32to8
int32_t utf32to8(uint8_t *utf8, int32_t length)
Lucene::UTF8EncoderStream::UTF8EncoderStream
UTF8EncoderStream(const ReaderPtr &reader)
Lucene::UTF16Decoder::utf16to32
int32_t utf16to32(wchar_t *unicode, int32_t length)
Lucene::UTF8DecoderStream::UTF8DecoderStream
UTF8DecoderStream(const ReaderPtr &reader)
Lucene::UTF8EncoderStream::~UTF8EncoderStream
virtual ~UTF8EncoderStream()
Lucene::UTF8Base::LEAD_OFFSET
static const uint16_t LEAD_OFFSET
Definition: UTF8Stream.h:24
Lucene::UTF8Encoder::encode
int32_t encode(uint8_t *utf8, int32_t length)
Lucene::UTF8Decoder
Definition: UTF8Stream.h:81
Lucene::UTF8Decoder::utf8End
const uint8_t * utf8End
Definition: UTF8Stream.h:90
Lucene::UTF8DecoderStream::~UTF8DecoderStream
virtual ~UTF8DecoderStream()
Lucene::UTF8Decoder::readNext
virtual uint32_t readNext()
Lucene::UTF8EncoderStream
Definition: UTF8Stream.h:67
Lucene::UTF8Base::LEAD_SURROGATE_MAX
static const uint16_t LEAD_SURROGATE_MAX
Definition: UTF8Stream.h:21
Lucene::UTF8Decoder::utf8to16
int32_t utf8to16(wchar_t *unicode, int32_t length)
Lucene::UTF8Encoder
Definition: UTF8Stream.h:44
Lucene::UTF16Decoder::utf16Begin
const uint16_t * utf16Begin
Definition: UTF8Stream.h:128
Lucene::UTF8DecoderStream::readNext
virtual uint32_t readNext()
Lucene::UTF8Base::TRAIL_SURROGATE_MAX
static const uint16_t TRAIL_SURROGATE_MAX
Definition: UTF8Stream.h:23
Lucene::UTF8Base::mask16
uint16_t mask16(uint32_t c)
Lucene::UTF8Encoder::unicodeEnd
const wchar_t * unicodeEnd
Definition: UTF8Stream.h:53
Lucene::UTF8Encoder::UTF8Encoder
UTF8Encoder(const wchar_t *unicodeBegin, const wchar_t *unicodeEnd)
Lucene::UTF8Base
Definition: UTF8Stream.h:14
Lucene::UTF8Base::isTrailSurrogate
bool isTrailSurrogate(uint32_t cp)
Lucene::UTF16Decoder::decode
int32_t decode(wchar_t *unicode, int32_t length)
Lucene::UTF8Base::isValidCodePoint
bool isValidCodePoint(uint32_t cp)
Lucene::UTF8Decoder::UTF8Decoder
UTF8Decoder(const uint8_t *utf8Begin, const uint8_t *utf8End)
Lucene::UTF16Decoder
Definition: UTF8Stream.h:120
Lucene::UTF8Decoder::decode
int32_t decode(wchar_t *unicode, int32_t length)
LuceneObject.h
Lucene::UTF16Decoder::~UTF16Decoder
virtual ~UTF16Decoder()

clucene.sourceforge.net