Lucene++ - a full-featured, c++ search engine
API Documentation


CharTokenizer.h
Go to the documentation of this file.
1 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
6 
7 #ifndef CHARTOKENIZER_H
8 #define CHARTOKENIZER_H
9 
10 #include "Tokenizer.h"
11 
12 namespace Lucene {
13 
14 /// An abstract base class for simple, character-oriented tokenizers.
15 class LPPAPI CharTokenizer : public Tokenizer {
16 public:
17  CharTokenizer(const ReaderPtr& input);
18  CharTokenizer(const AttributeSourcePtr& source, const ReaderPtr& input);
19  CharTokenizer(const AttributeFactoryPtr& factory, const ReaderPtr& input);
20  virtual ~CharTokenizer();
21 
22  LUCENE_CLASS(CharTokenizer);
23 
24 protected:
25  int32_t offset;
26  int32_t bufferIndex;
27  int32_t dataLen;
28 
29  static const int32_t MAX_WORD_LEN;
30  static const int32_t IO_BUFFER_SIZE;
31 
32  CharArray ioBuffer;
33  TermAttributePtr termAtt;
34  OffsetAttributePtr offsetAtt;
35 
36 public:
37  virtual bool incrementToken();
38  virtual void end();
39  virtual void reset(const ReaderPtr& input);
40 
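41 protected:
42  /// Returns true if a character should be included in a token. This tokenizer generates as tokens adjacent
43  /// sequences of characters which satisfy this predicate. Characters for which this is false are used to
44  /// define token boundaries and are not included in tokens.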
45  virtual bool isTokenChar(wchar_t c) = 0;
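46 
47  /// Called on each token character to normalize it before it is added to the token. The default
48  /// implementation does nothing. Subclasses may use this to, e.g., lowercase tokens.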
49  virtual wchar_t normalize(wchar_t c);
50 };
51 
52 }
53 
54 #endif
Lucene::CharTokenizer::normalize
virtual wchar_t normalize(wchar_t c)
Called on each token character to normalize it before it is added to the token. The default implementation does nothing. Subclasses may use this to, e.g., lowercase tokens.
LUCENE_CLASS
#define LUCENE_CLASS(Name)
Definition: LuceneObject.h:24
Lucene::ReaderPtr
boost::shared_ptr< Reader > ReaderPtr
Definition: LuceneTypes.h:547
Lucene::CharTokenizer::offset
int32_t offset
Definition: CharTokenizer.h:22
Lucene::CharTokenizer::CharTokenizer
CharTokenizer(const AttributeFactoryPtr &factory, const ReaderPtr &input)
Lucene::CharTokenizer::CharTokenizer
CharTokenizer(const ReaderPtr &input)
Lucene::CharTokenizer::IO_BUFFER_SIZE
static const int32_t IO_BUFFER_SIZE
Definition: CharTokenizer.h:30
Lucene::CharTokenizer::offsetAtt
OffsetAttributePtr offsetAtt
Definition: CharTokenizer.h:34
Lucene::CharTokenizer::ioBuffer
CharArray ioBuffer
Definition: CharTokenizer.h:32
Lucene::CharTokenizer
An abstract base class for simple, character-oriented tokenizers.
Definition: CharTokenizer.h:15
Lucene::CharTokenizer::isTokenChar
virtual bool isTokenChar(wchar_t c)=0
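Returns true if a character should be included in a token. This tokenizer generates as tokens adjacent sequences of characters which satisfy this predicate. Characters for which this is false are used to define token boundaries and are not included in tokens.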
Lucene
Definition: AbstractAllTermDocs.h:12
Lucene::CharTokenizer::dataLen
int32_t dataLen
Definition: CharTokenizer.h:27
Lucene::CharTokenizer::incrementToken
virtual bool incrementToken()
Consumers (i.e., IndexWriter) use this method to advance the stream to the next token....
Lucene::CharTokenizer::CharTokenizer
CharTokenizer(const AttributeSourcePtr &source, const ReaderPtr &input)
Lucene::OffsetAttributePtr
boost::shared_ptr< OffsetAttribute > OffsetAttributePtr
Definition: LuceneTypes.h:40
Lucene::AttributeSourcePtr
boost::shared_ptr< AttributeSource > AttributeSourcePtr
Definition: LuceneTypes.h:520
Lucene::Tokenizer
A Tokenizer is a TokenStream whose input is a Reader.
Definition: Tokenizer.h:20
Lucene::CharTokenizer::MAX_WORD_LEN
static const int32_t MAX_WORD_LEN
Definition: CharTokenizer.h:29
Lucene::TermAttributePtr
boost::shared_ptr< TermAttribute > TermAttributePtr
Definition: LuceneTypes.h:58
Lucene::CharTokenizer::termAtt
TermAttributePtr termAtt
Definition: CharTokenizer.h:33
Lucene::CharTokenizer::end
virtual void end()
This method is called by the consumer after the last token has been consumed, after incrementToken() returned false.
Lucene::CharTokenizer::bufferIndex
int32_t bufferIndex
Definition: CharTokenizer.h:26
Lucene::CharTokenizer::reset
virtual void reset(const ReaderPtr &input)
Reset the tokenizer to a new reader. Typically, an analyzer (in its reusableTokenStream method) will use this to re-use a previously created tokenizer.
Tokenizer.h
Lucene::CharTokenizer::~CharTokenizer
virtual ~CharTokenizer()
Lucene::AttributeFactoryPtr
boost::shared_ptr< AttributeFactory > AttributeFactoryPtr
Definition: LuceneTypes.h:519

clucene.sourceforge.net