diff --git a/pom.xml b/pom.xml
index 401f248..8dcf0d0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -31,7 +31,7 @@
-		<elasticsearch.version>0.20.2</elasticsearch.version>
+		<elasticsearch.version>0.90.0</elasticsearch.version>
@@ -132,4 +132,4 @@
-</project>
\ No newline at end of file
+</project>
diff --git a/src/main/java/org/elasticsearch/index/analysis/IkAnalyzer.java b/src/main/java/org/elasticsearch/index/analysis/IkAnalyzer.java
index 404e8e3..ffdf0d6 100644
--- a/src/main/java/org/elasticsearch/index/analysis/IkAnalyzer.java
+++ b/src/main/java/org/elasticsearch/index/analysis/IkAnalyzer.java
@@ -2,19 +2,32 @@ package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.wltea.analyzer.lucene.IKTokenizer;
import java.io.Reader;
public class IkAnalyzer extends Analyzer {
-
- @Override public TokenStream tokenStream(String fieldName, Reader reader) {
- return new IKTokenizer(reader,true);
- }
public IkAnalyzer() {
super();
}
+
+ @Override
+	protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+		Tokenizer tokenizer = new IKTokenizer(reader, true);
+		//single-stage chain: the tokenizer is both source and sink, so the
+		//one-argument TokenStreamComponents constructor applies (passing null
+		//as the result stream would NPE when the components are consumed)
+		return new TokenStreamComponents(tokenizer);
+	}
+
}
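
For reference, not part of the patch: Lucene 4.x removed the overridable Analyzer.tokenStream(String, Reader) in favor of a final tokenStream() plus the protected createComponents() hook returning reusable TokenStreamComponents, which is the migration this hunk performs. A minimal sketch of the new contract, assuming only the stock Lucene 4.0 core API (WhitespaceTokenizer stands in for IKTokenizer so the sketch compiles without the IK classes):

    import java.io.Reader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.util.Version;

    public final class MinimalAnalyzer extends Analyzer {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_40, reader);
            // single-stage chain: the tokenizer is both source and sink,
            // so the one-argument TokenStreamComponents constructor is used
            return new TokenStreamComponents(source);
        }
    }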
diff --git a/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java b/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java
index 288ec40..7ca75a9 100644
--- a/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java
+++ b/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java
@@ -24,11 +24,16 @@
*/
package org.wltea.analyzer.core;
-import org.wltea.analyzer.dic.Dictionary;
-
import java.io.IOException;
import java.io.Reader;
-import java.util.*;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.Map;
+import java.util.Set;
+
+import org.wltea.analyzer.cfg.Configuration;
+import org.wltea.analyzer.dic.Dictionary;
/**
*
@@ -68,12 +73,12 @@ class AnalyzeContext {
private Map pathMap;
//最终分词结果集
private LinkedList results;
-
+ private boolean useSmart;
//分词器配置项
- private boolean useSmart;
-
+// private Configuration cfg;
+
public AnalyzeContext(boolean useSmart){
- this.useSmart = useSmart;
+ this.useSmart = useSmart;
this.segmentBuff = new char[BUFF_SIZE];
this.charTypes = new int[BUFF_SIZE];
this.buffLocker = new HashSet();
@@ -313,7 +318,7 @@ class AnalyzeContext {
while(result != null){
//数量词合并
this.compound(result);
- if(Dictionary.isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
+ if(Dictionary.getSingleton().isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
//是停止词继续取列表的下一个
result = this.results.pollFirst();
}else{
@@ -344,6 +349,7 @@ class AnalyzeContext {
* 组合词元
*/
private void compound(Lexeme result){
+
if(!this.useSmart){
return ;
}
diff --git a/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java b/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java
index 86b1c8c..5867ff5 100644
--- a/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java
+++ b/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java
@@ -25,12 +25,12 @@
*/
package org.wltea.analyzer.core;
-import org.wltea.analyzer.dic.Dictionary;
-import org.wltea.analyzer.dic.Hit;
-
import java.util.LinkedList;
import java.util.List;
+import org.wltea.analyzer.dic.Dictionary;
+import org.wltea.analyzer.dic.Hit;
+
/**
* 中文-日韩文子分词器
@@ -58,7 +58,7 @@ class CJKSegmenter implements ISegmenter {
//处理词段队列
Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
for(Hit hit : tmpArray){
- hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit);
+ hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
if(hit.isMatch()){
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
@@ -77,7 +77,7 @@ class CJKSegmenter implements ISegmenter {
//*********************************
//再对当前指针位置的字符进行单字匹配
- Hit singleCharHit = Dictionary.matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
+ Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//首字成词
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
diff --git a/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java b/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java
index 50ed33a..b987a3a 100644
--- a/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java
+++ b/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java
@@ -24,14 +24,14 @@
*/
package org.wltea.analyzer.core;
-import org.wltea.analyzer.dic.Dictionary;
-import org.wltea.analyzer.dic.Hit;
-
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
+import org.wltea.analyzer.dic.Dictionary;
+import org.wltea.analyzer.dic.Hit;
+
/**
*
* 中文数量词子分词器
@@ -155,7 +155,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
//处理词段队列
Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
for(Hit hit : tmpArray){
- hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit);
+ hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
if(hit.isMatch()){
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
@@ -174,7 +174,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
//*********************************
//对当前指针位置的字符进行单字匹配
- Hit singleCharHit = Dictionary.matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
+ Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//首字成量词词
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
diff --git a/src/main/java/org/wltea/analyzer/core/IKArbitrator.java b/src/main/java/org/wltea/analyzer/core/IKArbitrator.java
index e15647b..18af1bd 100644
--- a/src/main/java/org/wltea/analyzer/core/IKArbitrator.java
+++ b/src/main/java/org/wltea/analyzer/core/IKArbitrator.java
@@ -38,7 +38,7 @@ class IKArbitrator {
/**
* 分词歧义处理
- * @param orgLexemes
+	 * @param context
* @param useSmart
*/
void process(AnalyzeContext context , boolean useSmart){
@@ -87,7 +87,6 @@ class IKArbitrator {
* 歧义识别
* @param lexemeCell 歧义路径链表头
* @param fullTextLength 歧义路径文本长度
- * @param option 候选结果路径
* @return
*/
private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
@@ -120,7 +119,7 @@ class IKArbitrator {
/**
* 向前遍历,添加词元,构造一个无歧义词元组合
- * @param LexemePath path
+	 * @param lexemeCell
+	 * @param option
* @return
*/
private Stack forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){
@@ -140,7 +139,7 @@ class IKArbitrator {
/**
* 回滚词元链,直到它能够接受指定的词元
- * @param lexeme
* @param l
*/
private void backPath(Lexeme l , LexemePath option){
diff --git a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java
index 3548192..aa20452 100644
--- a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java
+++ b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java
@@ -23,14 +23,15 @@
*/
package org.wltea.analyzer.core;
-import org.elasticsearch.common.logging.ESLogger;
-import org.elasticsearch.common.logging.Loggers;
-
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
+import org.wltea.analyzer.cfg.Configuration;
+//import org.wltea.analyzer.cfg.DefaultConfig;
+import org.wltea.analyzer.dic.Dictionary;
+
/**
* IK分词器主类
*
@@ -39,16 +40,18 @@ public final class IKSegmenter {
//字符窜reader
private Reader input;
+	//analyzer configuration
+ private Configuration cfg;
//分词器上下文
private AnalyzeContext context;
//分词处理器列表
private List segmenters;
//分词歧义裁决器
private IKArbitrator arbitrator;
- private ESLogger logger=null;
- private final boolean useSmart;
+ private boolean useSmart = false;
+
- /**
+ /**
* IK分词器构造函数
* @param input
* @param useSmart 为true,使用智能分词策略
@@ -57,16 +60,31 @@ public final class IKSegmenter {
* 智能分词: 合并数词和量词,对分词结果进行歧义判断
*/
public IKSegmenter(Reader input , boolean useSmart){
- logger = Loggers.getLogger("ik-analyzer");
this.input = input;
+// this.cfg = DefaultConfig.getInstance();
this.useSmart=useSmart;
- this.init();
+ this.init();
+ }
+
+ /**
+	 * IK segmenter constructor
+	 * @param input
+	 * @param cfg build the segmenter with a custom Configuration
+ *
+ */
+ public IKSegmenter(Reader input , Configuration cfg){
+ this.input = input;
+ this.cfg = cfg;
+ this.init();
}
/**
* 初始化
*/
private void init(){
+		//initialize the dictionary singleton
+// Dictionary.initial(this.cfg);
+// Dictionary.getSingleton();
//初始化分词上下文
this.context = new AnalyzeContext(useSmart);
//加载子分词器
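
For reference, not part of the patch: IKTokenizer (further down in this diff) drives IKSegmenter through next() and reset(Reader). A standalone sketch of that loop, assuming the IK dictionaries were already initialized via Dictionary.getInstance().Init(...); the sample sentence is arbitrary:

    import java.io.StringReader;
    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;

    public class IKSegmenterDemo {
        public static void main(String[] args) throws Exception {
            // useSmart = true merges numerals with quantifiers and arbitrates ambiguity
            IKSegmenter seg = new IKSegmenter(new StringReader("这是一个中文分词的例子"), true);
            Lexeme lex;
            while ((lex = seg.next()) != null) { // null marks the end of the input
                System.out.println(lex.getLexemeText()
                        + " [" + lex.getBeginPosition() + "," + lex.getEndPosition() + ")");
            }
        }
    }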
diff --git a/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java b/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java
index feb7f36..d239e91 100644
--- a/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java
+++ b/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java
@@ -120,7 +120,7 @@ class LetterSegmenter implements ISegmenter {
/**
* 处理数字字母混合输出
* 如:windos2000 | linliangyi2005@gmail.com
- * @param input
* @param context
* @return
*/
diff --git a/src/main/java/org/wltea/analyzer/dic/DictSegment.java b/src/main/java/org/wltea/analyzer/dic/DictSegment.java
index ecbcd9c..c34c5e2 100644
--- a/src/main/java/org/wltea/analyzer/dic/DictSegment.java
+++ b/src/main/java/org/wltea/analyzer/dic/DictSegment.java
@@ -326,13 +326,5 @@ class DictSegment implements Comparable{
//对当前节点存储的char进行比较
return this.nodeChar.compareTo(o.nodeChar);
}
-
- public int getDicNum(){
- if(charMap!=null)
- {
- return charMap.size();
- }
- return 0;
- }
-
+
}
diff --git a/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/src/main/java/org/wltea/analyzer/dic/Dictionary.java
index c02c5b7..36ea8e3 100644
--- a/src/main/java/org/wltea/analyzer/dic/Dictionary.java
+++ b/src/main/java/org/wltea/analyzer/dic/Dictionary.java
@@ -1,74 +1,233 @@
/**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
*
*/
package org.wltea.analyzer.dic;
+import java.io.*;
+import java.util.Collection;
+import java.util.List;
+
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;
-import java.io.*;
-import java.util.Collection;
-import java.util.List;
-
+/**
+ * Dictionary manager, singleton pattern
+ */
public class Dictionary {
- public static final String PATH_DIC_MAIN = "ik/main.dic";
- public static final String PATH_DIC_SURNAME = "ik/surname.dic";
- public static final String PATH_DIC_QUANTIFIER = "ik/quantifier.dic";
- public static final String PATH_DIC_SUFFIX = "ik/suffix.dic";
- public static final String PATH_DIC_PREP = "ik/preposition.dic";
- public static final String PATH_DIC_STOP = "ik/stopword.dic";
- private static final Dictionary singleton;
-
- static{
- singleton = new Dictionary();
- }
+ /*
+	 * the dictionary singleton instance
+ */
+ private static Dictionary singleton;
+
+ /*
+	 * the main dictionary
+ */
private DictSegment _MainDict;
-
- private DictSegment _SurnameDict;
-
+
+ /*
+	 * the stopword dictionary
+ */
+ private DictSegment _StopWordDict;
+ /*
+	 * the quantifier dictionary
+ */
private DictSegment _QuantifierDict;
-
- private DictSegment _SuffixDict;
-
- private DictSegment _PrepDict;
-
- private DictSegment _StopWords;
-
- private Environment environment;
- private Configuration configuration;
+
+ /**
+	 * the configuration object
+ */
+ private Configuration configuration;
private ESLogger logger=null;
private static boolean dictInited=false;
- private Dictionary(){
+ private Environment environment;
+ public static final String PATH_DIC_MAIN = "ik/main.dic";
+ public static final String PATH_DIC_SURNAME = "ik/surname.dic";
+ public static final String PATH_DIC_QUANTIFIER = "ik/quantifier.dic";
+ public static final String PATH_DIC_SUFFIX = "ik/suffix.dic";
+ public static final String PATH_DIC_PREP = "ik/preposition.dic";
+ public static final String PATH_DIC_STOP = "ik/stopword.dic";
+ private Dictionary(){
logger = Loggers.getLogger("ik-analyzer");
- }
-
- public Configuration getConfig(){
- return configuration;
- }
+ }
+ static{
+ singleton = new Dictionary();
+ }
+// public Configuration getConfig(){
+// return configuration;
+// }
+// private Dictionary(Configuration cfg){
+// this.cfg = cfg;
+// this.loadMainDict();
+// this.loadStopWordDict();
+// this.loadQuantifierDict();
+// }
public void Init(Settings indexSettings){
- if(!dictInited){
- environment =new Environment(indexSettings);
- configuration=new Configuration(indexSettings);
- loadMainDict();
- loadSurnameDict();
- loadQuantifierDict();
- loadSuffixDict();
- loadPrepDict();
- loadStopWordDict();
- dictInited=true;
- }
+ if(!dictInited){
+ environment =new Environment(indexSettings);
+ configuration=new Configuration(indexSettings);
+ loadMainDict();
+// loadSurnameDict();
+ loadQuantifierDict();
+// loadSuffixDict();
+// loadPrepDict();
+ loadStopWordDict();
+ dictInited=true;
+ }
}
+ /**
+	 * Dictionary initialization.
+	 * Because IK Analyzer initializes its dictionaries through static methods of
+	 * the Dictionary class, loading only starts when the class is actually used,
+	 * which lengthens the first segmentation call.
+	 * This method provides a way to initialize the dictionaries at application startup.
+ * @return Dictionary
+ */
+// public static Dictionary initial(Configuration cfg){
+// if(singleton == null){
+// synchronized(Dictionary.class){
+// if(singleton == null){
+// singleton = new Dictionary();
+// return singleton;
+// }
+// }
+// }
+// return singleton;
+// }
+
+ /**
+	 * Get the dictionary singleton instance
+	 * @return Dictionary the singleton
+ */
+ public static Dictionary getSingleton(){
+ if(singleton == null){
+			throw new IllegalStateException("Dictionary has not been initialized; call Init first");
+ }
+ return singleton;
+ }
+
+ /**
+	 * Batch-load new word entries
+	 * @param words a Collection of word entries
+ */
+	public void addWords(Collection<String> words){
+ if(words != null){
+ for(String word : words){
+ if (word != null) {
+					//batch-load entries into the in-memory main dictionary
+ singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
+ }
+ }
+ }
+ }
+
+ /**
+	 * Batch remove (disable) word entries
+ * @param words
+ */
+	public void disableWords(Collection<String> words){
+ if(words != null){
+ for(String word : words){
+ if (word != null) {
+					//batch-disable entries
+ singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
+ }
+ }
+ }
+ }
+
+ /**
+	 * Match against the main dictionary
+	 * @param charArray
+	 * @return Hit the match result descriptor
+ */
+ public Hit matchInMainDict(char[] charArray){
+ return singleton._MainDict.match(charArray);
+ }
+
+ /**
+	 * Match against the main dictionary
+	 * @param charArray
+	 * @param begin
+	 * @param length
+	 * @return Hit the match result descriptor
+ */
+ public Hit matchInMainDict(char[] charArray , int begin, int length){
+ return singleton._MainDict.match(charArray, begin, length);
+ }
+
+ /**
+	 * Match against the quantifier dictionary
+	 * @param charArray
+	 * @param begin
+	 * @param length
+	 * @return Hit the match result descriptor
+ */
+ public Hit matchInQuantifierDict(char[] charArray , int begin, int length){
+ return singleton._QuantifierDict.match(charArray, begin, length);
+ }
+
+
+ /**
+	 * Take the DictSegment from a matched Hit and continue matching downward from it
+ * @param charArray
+ * @param currentIndex
+ * @param matchedHit
+ * @return Hit
+ */
+ public Hit matchWithHit(char[] charArray , int currentIndex , Hit matchedHit){
+ DictSegment ds = matchedHit.getMatchedDictSegment();
+ return ds.match(charArray, currentIndex, 1 , matchedHit);
+ }
+
+
+ /**
+	 * Check whether the given span is a stopword
+ * @param charArray
+ * @param begin
+ * @param length
+ * @return boolean
+ */
+ public boolean isStopWord(char[] charArray , int begin, int length){
+ return singleton._StopWordDict.match(charArray, begin, length).isMatch();
+ }
+
+ /**
+	 * Load the main dictionary and the extension dictionaries
+ */
private void loadMainDict(){
+		//create the main dictionary instance
_MainDict = new DictSegment((char)0);
-
+		//read the main dictionary file
File file= new File(environment.configFile(), Dictionary.PATH_DIC_MAIN);
InputStream is = null;
@@ -77,24 +236,21 @@ public class Dictionary {
} catch (FileNotFoundException e) {
e.printStackTrace();
}
		if(is == null){
			throw new RuntimeException("Main Dictionary not found!!!");
		}

try {
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
- String theWord;
+ String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
- _MainDict.fillSegment(theWord.trim().toCharArray());
+ _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
- logger.info("[Dict Loading] {},MainDict Size:{}",file.toString(),_MainDict.getDicNum());
+
} catch (IOException ioe) {
System.err.println("Main Dictionary loading exception.");
ioe.printStackTrace();
-
+
}finally{
try {
if(is != null){
@@ -105,41 +261,42 @@ public class Dictionary {
e.printStackTrace();
}
}
-
-
+		//load the extension dictionaries
+ this.loadExtDict();
+ }
+
+ /**
+	 * Load the user-configured extension dictionaries into the main dictionary
+ */
+ private void loadExtDict(){
+		//load the extension dictionary configuration
List extDictFiles = configuration.getExtDictionarys();
if(extDictFiles != null){
+ InputStream is = null;
for(String extDictName : extDictFiles){
-
- File tempFile=new File(environment.configFile(),extDictName);
-
- try {
- is = new FileInputStream(tempFile);
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- logger.error("[Dict Loading]",e);
- }
-
- if(is == null){
+				//read the extension dictionary file
+				System.out.println("Loading extension dictionary: " + extDictName);
+				is = this.getClass().getClassLoader().getResourceAsStream(extDictName);
+				//if the extension dictionary cannot be found, skip it
+ if(is == null){
continue;
}
try {
-
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
- String theWord;
+ String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
-
-
+					//load the extension entries into the in-memory main dictionary
+ //System.out.println(theWord);
_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
- logger.info("[Dict Loading] {},MainDict Size:{}",tempFile.toString(),_MainDict.getDicNum());
+
} catch (IOException ioe) {
System.err.println("Extension Dictionary loading exception.");
ioe.printStackTrace();
-
+
}finally{
try {
if(is != null){
@@ -151,77 +308,85 @@ public class Dictionary {
}
}
}
- }
+ }
}
-
-
- private void loadSurnameDict(){
-
- _SurnameDict = new DictSegment((char)0);
- File file=new File(environment.configFile(),Dictionary.PATH_DIC_SURNAME);
- InputStream is = null;
- try {
- is = new FileInputStream(file);
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- }
- if(is == null){
- throw new RuntimeException("Surname Dictionary not found!!!");
- }
- try {
- BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
- String theWord;
- do {
- theWord = br.readLine();
- if (theWord != null && !"".equals(theWord.trim())) {
- _SurnameDict.fillSegment(theWord.trim().toCharArray());
+
+ /**
+	 * Load the user-extended stopword dictionaries
+ */
+ private void loadStopWordDict(){
+		//create the stopword dictionary instance
+		_StopWordDict = new DictSegment((char)0);
+		//load the extension stopword dictionaries
+ List extStopWordDictFiles = configuration.getExtStopWordDictionarys();
+ if(extStopWordDictFiles != null){
+ InputStream is = null;
+ for(String extStopWordDictName : extStopWordDictFiles){
+				System.out.println("Loading extension stopword dictionary: " + extStopWordDictName);
+				//read the extension dictionary file
+				is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName);
+				//if the extension dictionary cannot be found, skip it
+ if(is == null){
+ continue;
}
- } while (theWord != null);
- logger.info("[Dict Loading] {},SurnameDict Size:{}",file.toString(),_SurnameDict.getDicNum());
- } catch (IOException ioe) {
- System.err.println("Surname Dictionary loading exception.");
- ioe.printStackTrace();
-
- }finally{
- try {
- if(is != null){
- is.close();
- is = null;
+ try {
+ BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
+ String theWord = null;
+ do {
+ theWord = br.readLine();
+ if (theWord != null && !"".equals(theWord.trim())) {
+ //System.out.println(theWord);
+							//load the extension stopword entries into memory
+ _StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
+ }
+ } while (theWord != null);
+
+ } catch (IOException ioe) {
+ System.err.println("Extension Stop word Dictionary loading exception.");
+ ioe.printStackTrace();
+
+ }finally{
+ try {
+ if(is != null){
+ is.close();
+ is = null;
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
}
- } catch (IOException e) {
- e.printStackTrace();
}
- }
+ }
}
-
-
+
+ /**
+	 * Load the quantifier dictionary
+ */
private void loadQuantifierDict(){
-
+		//create the quantifier dictionary instance
_QuantifierDict = new DictSegment((char)0);
+		//read the quantifier dictionary file
File file=new File(environment.configFile(),Dictionary.PATH_DIC_QUANTIFIER);
InputStream is = null;
try {
is = new FileInputStream(file);
} catch (FileNotFoundException e) {
e.printStackTrace();
		}
		if(is == null){
			throw new RuntimeException("Quantifier Dictionary not found!!!");
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
- String theWord;
+ String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
- _QuantifierDict.fillSegment(theWord.trim().toCharArray());
+ _QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
- logger.info("[Dict Loading] {},QuantifierDict Size:{}",file.toString(),_QuantifierDict.getDicNum());
+
} catch (IOException ioe) {
System.err.println("Quantifier Dictionary loading exception.");
ioe.printStackTrace();
-
+
}finally{
try {
if(is != null){
@@ -235,304 +400,8 @@ public class Dictionary {
}
- private void loadSuffixDict(){
-
- _SuffixDict = new DictSegment((char)0);
- File file=new File(environment.configFile(),Dictionary.PATH_DIC_SUFFIX);
- InputStream is = null;
- try {
- is = new FileInputStream(file);
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- }
- if(is == null){
- throw new RuntimeException("Suffix Dictionary not found!!!");
- }
- try {
-
- BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
- String theWord;
- do {
- theWord = br.readLine();
- if (theWord != null && !"".equals(theWord.trim())) {
- _SuffixDict.fillSegment(theWord.trim().toCharArray());
- }
- } while (theWord != null);
- logger.info("[Dict Loading] {},SuffixDict Size:{}",file.toString(),_SuffixDict.getDicNum());
- } catch (IOException ioe) {
- System.err.println("Suffix Dictionary loading exception.");
- ioe.printStackTrace();
-
- }finally{
- try {
- if(is != null){
- is.close();
- is = null;
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
-
-
- private void loadPrepDict(){
-
- _PrepDict = new DictSegment((char)0);
- File file=new File(environment.configFile(),Dictionary.PATH_DIC_PREP);
- InputStream is = null;
- try {
- is = new FileInputStream(file);
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- }
- if(is == null){
- throw new RuntimeException("Preposition Dictionary not found!!!");
- }
- try {
-
- BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
- String theWord;
- do {
- theWord = br.readLine();
- if (theWord != null && !"".equals(theWord.trim())) {
-
- _PrepDict.fillSegment(theWord.trim().toCharArray());
- }
- } while (theWord != null);
- logger.info("[Dict Loading] {},PrepDict Size:{}",file.toString(),_PrepDict.getDicNum());
- } catch (IOException ioe) {
- System.err.println("Preposition Dictionary loading exception.");
- ioe.printStackTrace();
-
- }finally{
- try {
- if(is != null){
- is.close();
- is = null;
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
-
-
- private void loadStopWordDict(){
-
- _StopWords = new DictSegment((char)0);
- File file=new File(environment.configFile(),Dictionary.PATH_DIC_STOP);
- InputStream is = null;
- try {
- is = new FileInputStream(file);
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- }
- if(is == null){
- throw new RuntimeException("Stopword Dictionary not found!!!");
- }
- try {
-
- BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
- String theWord;
- do {
- theWord = br.readLine();
- if (theWord != null && !"".equals(theWord.trim())) {
- _StopWords.fillSegment(theWord.trim().toCharArray());
- }
- } while (theWord != null);
- logger.info("[Dict Loading] {},Stopwords Size:{}",file.toString(),_StopWords.getDicNum());
- } catch (IOException ioe) {
- System.err.println("Stopword Dictionary loading exception.");
- ioe.printStackTrace();
-
- }finally{
- try {
- if(is != null){
- is.close();
- is = null;
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
-
- List extStopWordDictFiles = configuration.getExtStopWordDictionarys();
- if(extStopWordDictFiles != null){
- for(String extStopWordDictName : extStopWordDictFiles){
- File tempFile=new File(environment.configFile(),extStopWordDictName);
- try {
- is = new FileInputStream(tempFile);
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- }
-
- if(is == null){
- continue;
- }
- try {
-
- BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
- String theWord;
- do {
- theWord = br.readLine();
- if (theWord != null && !"".equals(theWord.trim())) {
-
-
- _StopWords.fillSegment(theWord.trim().toCharArray());
- }
- } while (theWord != null);
- logger.info("[Dict Loading] {},Stopwords Size:{}",tempFile.toString(),_StopWords.getDicNum());
- } catch (IOException ioe) {
- System.err.println("Extension Stop word Dictionary loading exception.");
- ioe.printStackTrace();
-
- }finally{
- try {
- if(is != null){
- is.close();
- is = null;
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
-
- }
-
- public static Dictionary getInstance(){
- return Dictionary.singleton;
- }
-
- public static void loadExtendWords(Collection extWords){
- if(extWords != null){
- for(String extWord : extWords){
- if (extWord != null) {
-
- singleton._MainDict.fillSegment(extWord.trim().toCharArray());
- }
- }
- }
- }
-
-
- public static void loadExtendStopWords(Collection extStopWords){
- if(extStopWords != null){
- for(String extStopWord : extStopWords){
- if (extStopWord != null) {
-
- singleton._StopWords.fillSegment(extStopWord.trim().toCharArray());
- }
- }
- }
- }
-
-
- public static Hit matchInMainDict(char[] charArray){
- return singleton._MainDict.match(charArray);
- }
-
-
- public static Hit matchInMainDict(char[] charArray , int begin, int length){
- return singleton._MainDict.match(charArray, begin, length);
- }
-
-
- public static Hit matchInMainDictWithHit(char[] charArray , int currentIndex , Hit matchedHit){
- DictSegment ds = matchedHit.getMatchedDictSegment();
- return ds.match(charArray, currentIndex, 1 , matchedHit);
- }
-
+ public static Dictionary getInstance(){
+ return Dictionary.singleton;
+ }
- public static Hit matchInSurnameDict(char[] charArray , int begin, int length){
- return singleton._SurnameDict.match(charArray, begin, length);
- }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- /**
- * 检索匹配量词词典
- * @param charArray
- * @param begin
- * @param length
- * @return Hit 匹配结果描述
- */
- public static Hit matchInQuantifierDict(char[] charArray , int begin, int length){
- return singleton._QuantifierDict.match(charArray, begin, length);
- }
-
- /**
- * 检索匹配在后缀词典
- * @param charArray
- * @param begin
- * @param length
- * @return Hit 匹配结果描述
- */
- public static Hit matchInSuffixDict(char[] charArray , int begin, int length){
- return singleton._SuffixDict.match(charArray, begin, length);
- }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- /**
- * 检索匹配介词、副词词典
- * @param charArray
- * @param begin
- * @param length
- * @return Hit 匹配结果描述
- */
- public static Hit matchInPrepDict(char[] charArray , int begin, int length){
- return singleton._PrepDict.match(charArray, begin, length);
- }
-
- /**
- * 判断是否是停止词
- * @param charArray
- * @param begin
- * @param length
- * @return boolean
- */
- public static boolean isStopWord(char[] charArray , int begin, int length){
- return singleton._StopWords.match(charArray, begin, length).isMatch();
- }
}
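
For reference, not part of the patch: this refactor moves call sites from Dictionary's old static helpers (matchInMainDict, matchInMainDictWithHit, isStopWord, ...) to instance methods reached through the singleton, which is what the AnalyzeContext, CJKSegmenter and CN_QuantifierSegmenter hunks above change. A minimal sketch of the new access pattern; ImmutableSettings is assumed from the Elasticsearch 0.90 API, and Init expects the ik/*.dic files under the resolved config directory:

    import org.elasticsearch.common.settings.ImmutableSettings;
    import org.elasticsearch.common.settings.Settings;
    import org.wltea.analyzer.dic.Dictionary;
    import org.wltea.analyzer.dic.Hit;

    public class DictionaryDemo {
        public static void main(String[] args) {
            // one-time initialization, normally done by IKAnalyzer(Settings, Settings)
            Settings settings = ImmutableSettings.settingsBuilder().build();
            Dictionary.getInstance().Init(settings);

            char[] text = "中华人民共和国".toCharArray();
            // instance call through the singleton replaces the old static helper
            Hit hit = Dictionary.getSingleton().matchInMainDict(text, 0, 2);
            System.out.println("match: " + hit.isMatch() + ", prefix: " + hit.isPrefix());
        }
    }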
diff --git a/src/main/java/org/wltea/analyzer/dic/Hit.java b/src/main/java/org/wltea/analyzer/dic/Hit.java
index b5110bd..cdfd0e5 100644
--- a/src/main/java/org/wltea/analyzer/dic/Hit.java
+++ b/src/main/java/org/wltea/analyzer/dic/Hit.java
@@ -58,7 +58,9 @@ public class Hit {
public boolean isMatch() {
return (this.hitState & MATCH) > 0;
}
-
+	/**
+	 * set the MATCH flag on the hit state
+	 */
public void setMatch() {
this.hitState = this.hitState | MATCH;
}
@@ -69,7 +71,9 @@ public class Hit {
public boolean isPrefix() {
return (this.hitState & PREFIX) > 0;
}
-
+	/**
+	 * set the PREFIX flag on the hit state
+	 */
public void setPrefix() {
this.hitState = this.hitState | PREFIX;
}
@@ -79,7 +83,9 @@ public class Hit {
public boolean isUnmatch() {
return this.hitState == UNMATCH ;
}
-
+	/**
+	 * reset the hit state to UNMATCH
+	 */
public void setUnmatch() {
this.hitState = UNMATCH;
}
diff --git a/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java b/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java
index 665954d..1dd15d5 100644
--- a/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java
+++ b/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java
@@ -1,51 +1,87 @@
/**
+ * IK 中文分词 版本 5.0.1
+ * IK Analyzer release 5.0.1
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package org.wltea.analyzer.lucene;
+import java.io.Reader;
+
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.settings.Settings;
import org.wltea.analyzer.dic.Dictionary;
-import java.io.Reader;
-
-public final class IKAnalyzer extends Analyzer {
+/**
+ * IK analyzer, Lucene Analyzer interface implementation,
+ * compatible with Lucene 4.0
+ */
+public final class IKAnalyzer extends Analyzer{
- private boolean isMaxWordLength = false;
- private boolean useSmart=false;
+ private boolean useSmart;
+
+ public boolean useSmart() {
+ return useSmart;
+ }
- public IKAnalyzer(){
+ public void setUseSmart(boolean useSmart) {
+ this.useSmart = useSmart;
+ }
+
+ /**
+	 * IK analyzer, Lucene Analyzer interface implementation
+	 *
+	 * defaults to the fine-grained segmentation algorithm
+ */
+ public IKAnalyzer(){
this(false);
}
-
- public IKAnalyzer(boolean isMaxWordLength){
+ /**
+	 * IK analyzer, Lucene Analyzer interface implementation
+	 *
+	 * @param useSmart when true, the analyzer performs smart segmentation
+ */
+ public IKAnalyzer(boolean useSmart){
super();
- this.setMaxWordLength(isMaxWordLength);
+ this.useSmart = useSmart;
}
public IKAnalyzer(Settings indexSetting,Settings settings1) {
super();
- Dictionary.getInstance().Init(indexSetting);
+ Dictionary.getInstance().Init(indexSetting);
if(settings1.get("use_smart", "true").equals("true")){
- useSmart=true;
+ useSmart = true;
}
}
-
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- return new IKTokenizer(reader , useSmart);
- }
-
- public void setMaxWordLength(boolean isMaxWordLength) {
- this.isMaxWordLength = isMaxWordLength;
- }
-
- public boolean isMaxWordLength() {
- return isMaxWordLength;
+ /**
+	 * Overrides Analyzer to construct the tokenization components
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, final Reader in) {
+ Tokenizer _IKTokenizer = new IKTokenizer(in , this.useSmart());
+ return new TokenStreamComponents(_IKTokenizer);
}
}
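
For reference, not part of the patch: a short sketch contrasting the default fine-grained mode with the smart mode selected by the new useSmart switch. It uses only the standard Lucene 4.x consumption workflow (reset / incrementToken / end / close) and assumes the dictionaries are initialized; the sample text and field name are arbitrary:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.wltea.analyzer.lucene.IKAnalyzer;

    public class SmartModeDemo {
        static void print(IKAnalyzer analyzer, String text) throws Exception {
            TokenStream ts = analyzer.tokenStream("f", new StringReader(text));
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset(); // mandatory before the first incrementToken() in Lucene 4.x
            while (ts.incrementToken()) {
                System.out.print(term + " | ");
            }
            ts.end();
            ts.close();
            System.out.println();
        }

        public static void main(String[] args) throws Exception {
            print(new IKAnalyzer(false), "中华人民共和国国歌"); // fine-grained output
            print(new IKAnalyzer(true), "中华人民共和国国歌");  // smart, disambiguated output
        }
    }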
diff --git a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
index ffd5f02..846e4f1 100644
--- a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
+++ b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
@@ -1,7 +1,7 @@
/**
* IK 中文分词 版本 5.0.1
* IK Analyzer release 5.0.1
- *
+ *
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -20,94 +20,95 @@
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
- *
+ *
- *
+ *
*/
package org.wltea.analyzer.lucene;
+import java.io.IOException;
+import java.io.Reader;
+
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
-import java.io.IOException;
-import java.io.Reader;
-
/**
* IK分词器 Lucene Tokenizer适配器类
* 兼容Lucene 4.0版本
*/
public final class IKTokenizer extends Tokenizer {
+
+	//the IK segmenter implementation
+	private IKSegmenter _IKImplement;
+
+	//lexeme text attribute
+	private final CharTermAttribute termAtt;
+	//lexeme offset attribute
+	private final OffsetAttribute offsetAtt;
+	//lexeme type attribute (see the type constants in org.wltea.analyzer.core.Lexeme)
+	private final TypeAttribute typeAtt;
+	//end position of the last lexeme
+	private int endPosition;
+
+	/**
+	 * Lucene 4.0 Tokenizer adapter constructor
+ * @param in
+ * @param useSmart
+ */
+ public IKTokenizer(Reader in , boolean useSmart){
+ super(in);
+ offsetAtt = addAttribute(OffsetAttribute.class);
+ termAtt = addAttribute(CharTermAttribute.class);
+ typeAtt = addAttribute(TypeAttribute.class);
+ _IKImplement = new IKSegmenter(input , useSmart);
+ }
- //IK分词器实现
- private IKSegmenter _IKImplement;
-
- //词元文本属性
- private final CharTermAttribute termAtt;
- //词元位移属性
- private final OffsetAttribute offsetAtt;
- //词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)
- private final TypeAttribute typeAtt;
- //记录最后一个词元的结束位置
- private int endPosition;
-
- /**
- * Lucene 4.0 Tokenizer适配器类构造函数
- * @param in
- * @param useSmart
- */
- public IKTokenizer(Reader in , boolean useSmart){
- super(in);
- offsetAtt = addAttribute(OffsetAttribute.class);
- termAtt = addAttribute(CharTermAttribute.class);
- typeAtt = addAttribute(TypeAttribute.class);
- _IKImplement = new IKSegmenter(input , useSmart);
- }
-
- /* (non-Javadoc)
- * @see org.apache.lucene.analysis.TokenStream#incrementToken()
- */
- @Override
- public boolean incrementToken() throws IOException {
- //清除所有的词元属性
- clearAttributes();
- Lexeme nextLexeme = _IKImplement.next();
- if(nextLexeme != null){
- //将Lexeme转成Attributes
- //设置词元文本
- termAtt.append(nextLexeme.getLexemeText());
- //设置词元长度
- termAtt.setLength(nextLexeme.getLength());
- //设置词元位移
- offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
- //记录分词的最后位置
- endPosition = nextLexeme.getEndPosition();
- //记录词元分类
- typeAtt.setType(nextLexeme.getLexemeTypeString());
- //返会true告知还有下个词元
- return true;
- }
- //返会false告知词元输出完毕
- return false;
- }
-
- /*
- * (non-Javadoc)
- * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
- */
- @Override
- public void reset() throws IOException {
- super.reset();
- _IKImplement.reset(input);
- }
-
- @Override
- public final void end() {
- // set final offset
- int finalOffset = correctOffset(this.endPosition);
- offsetAtt.setOffset(finalOffset, finalOffset);
- }
+ /* (non-Javadoc)
+ * @see org.apache.lucene.analysis.TokenStream#incrementToken()
+ */
+ @Override
+ public boolean incrementToken() throws IOException {
+		//clear all token attributes
+		clearAttributes();
+		Lexeme nextLexeme = _IKImplement.next();
+		if(nextLexeme != null){
+			//convert the Lexeme into Attributes
+			//set the lexeme text
+			termAtt.append(nextLexeme.getLexemeText());
+			//set the lexeme length
+			termAtt.setLength(nextLexeme.getLength());
+			//set the lexeme offsets
+			offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
+			//record the end position of this lexeme
+			endPosition = nextLexeme.getEndPosition();
+			//record the lexeme type
+			typeAtt.setType(nextLexeme.getLexemeTypeString());
+			//return true: another lexeme was produced
+			return true;
+		}
+		//return false: no more lexemes
+ return false;
+ }
+
+ /*
+ * (non-Javadoc)
+ * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
+ */
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ _IKImplement.reset(input);
+ }
+
+ @Override
+ public final void end() {
+ // set final offset
+ int finalOffset = correctOffset(this.endPosition);
+ offsetAtt.setOffset(finalOffset, finalOffset);
+ }
}
diff --git a/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java b/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java
index 63b730b..1b86a35 100644
--- a/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java
+++ b/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java
@@ -1,716 +1,716 @@
-///**
-// * IK 中文分词 版本 5.0
-// * IK Analyzer release 5.0
-// *
-// * Licensed to the Apache Software Foundation (ASF) under one or more
-// * contributor license agreements. See the NOTICE file distributed with
-// * this work for additional information regarding copyright ownership.
-// * The ASF licenses this file to You under the Apache License, Version 2.0
-// * (the "License"); you may not use this file except in compliance with
-// * the License. You may obtain a copy of the License at
-// *
-// * http://www.apache.org/licenses/LICENSE-2.0
-// *
-// * Unless required by applicable law or agreed to in writing, software
-// * distributed under the License is distributed on an "AS IS" BASIS,
-// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// * See the License for the specific language governing permissions and
-// * limitations under the License.
-// *
-// * 源代码由林良益(linliangyi2005@gmail.com)提供
-// * 版权声明 2012,乌龙茶工作室
-// * provided by Linliangyi and copyright 2012 by Oolong studio
-// *
-// */
-//package org.wltea.analyzer.query;
-//
-//import java.util.ArrayList;
-//import java.util.LinkedList;
-//import java.util.List;
-//import java.util.Stack;
-//
-//import org.apache.lucene.index.Term;
-//import org.apache.lucene.search.BooleanClause;
-//import org.apache.lucene.search.BooleanQuery;
-//import org.apache.lucene.search.Query;
-//import org.apache.lucene.search.TermQuery;
-//import org.apache.lucene.search.TermRangeQuery;
-//import org.apache.lucene.search.BooleanClause.Occur;
-//import org.apache.lucene.util.BytesRef;
-//
-///**
-// * IK简易查询表达式解析
-// * 结合SWMCQuery算法
-// *
-// * 表达式例子 :
-// * (id='1231231' && title:'monkey') || (content:'你好吗' || ulr='www.ik.com') - name:'helloword'
-// * @author linliangyi
-// *
-// */
-//public class IKQueryExpressionParser {
-//
-// //public static final String LUCENE_SPECIAL_CHAR = "&&||-()':={}[],";
-//
-// private List elements = new ArrayList();
-//
-// private Stack querys = new Stack();
-//
-// private Stack operates = new Stack();
-//
-// /**
-// * 解析查询表达式,生成Lucene Query对象
-// *
-// * @param expression
-// * @param quickMode
-// * @return Lucene query
-// */
-// public Query parseExp(String expression , boolean quickMode){
-// Query lucenceQuery = null;
-// if(expression != null && !"".equals(expression.trim())){
-// try{
-// //文法解析
-// this.splitElements(expression);
-// //语法解析
-// this.parseSyntax(quickMode);
-// if(this.querys.size() == 1){
-// lucenceQuery = this.querys.pop();
-// }else{
-// throw new IllegalStateException("表达式异常: 缺少逻辑操作符 或 括号缺失");
-// }
-// }finally{
-// elements.clear();
-// querys.clear();
-// operates.clear();
-// }
-// }
-// return lucenceQuery;
-// }
-//
-// /**
-// * 表达式文法解析
-// * @param expression
-// */
-// private void splitElements(String expression){
-//
-// if(expression == null){
-// return;
-// }
-// Element curretElement = null;
-//
-// char[] expChars = expression.toCharArray();
-// for(int i = 0 ; i < expChars.length ; i++){
-// switch(expChars[i]){
-// case '&' :
-// if(curretElement == null){
-// curretElement = new Element();
-// curretElement.type = '&';
-// curretElement.append(expChars[i]);
-// }else if(curretElement.type == '&'){
-// curretElement.append(expChars[i]);
-// this.elements.add(curretElement);
-// curretElement = null;
-// }else if(curretElement.type == '\''){
-// curretElement.append(expChars[i]);
-// }else {
-// this.elements.add(curretElement);
-// curretElement = new Element();
-// curretElement.type = '&';
-// curretElement.append(expChars[i]);
-// }
-// break;
-//
-// case '|' :
-// if(curretElement == null){
-// curretElement = new Element();
-// curretElement.type = '|';
-// curretElement.append(expChars[i]);
-// }else if(curretElement.type == '|'){
-// curretElement.append(expChars[i]);
-// this.elements.add(curretElement);
-// curretElement = null;
-// }else if(curretElement.type == '\''){
-// curretElement.append(expChars[i]);
-// }else {
-// this.elements.add(curretElement);
-// curretElement = new Element();
-// curretElement.type = '|';
-// curretElement.append(expChars[i]);
-// }
-// break;
-//
-// case '-' :
-// if(curretElement != null){
-// if(curretElement.type == '\''){
-// curretElement.append(expChars[i]);
-// continue;
-// }else{
-// this.elements.add(curretElement);
-// }
-// }
-// curretElement = new Element();
-// curretElement.type = '-';
-// curretElement.append(expChars[i]);
-// this.elements.add(curretElement);
-// curretElement = null;
-// break;
-//
-// case '(' :
-// if(curretElement != null){
-// if(curretElement.type == '\''){
-// curretElement.append(expChars[i]);
-// continue;
-// }else{
-// this.elements.add(curretElement);
-// }
-// }
-// curretElement = new Element();
-// curretElement.type = '(';
-// curretElement.append(expChars[i]);
-// this.elements.add(curretElement);
-// curretElement = null;
-// break;
-//
-// case ')' :
-// if(curretElement != null){
-// if(curretElement.type == '\''){
-// curretElement.append(expChars[i]);
-// continue;
-// }else{
-// this.elements.add(curretElement);
-// }
-// }
-// curretElement = new Element();
-// curretElement.type = ')';
-// curretElement.append(expChars[i]);
-// this.elements.add(curretElement);
-// curretElement = null;
-// break;
-//
-// case ':' :
-// if(curretElement != null){
-// if(curretElement.type == '\''){
-// curretElement.append(expChars[i]);
-// continue;
-// }else{
-// this.elements.add(curretElement);
-// }
-// }
-// curretElement = new Element();
-// curretElement.type = ':';
-// curretElement.append(expChars[i]);
-// this.elements.add(curretElement);
-// curretElement = null;
-// break;
-//
-// case '=' :
-// if(curretElement != null){
-// if(curretElement.type == '\''){
-// curretElement.append(expChars[i]);
-// continue;
-// }else{
-// this.elements.add(curretElement);
-// }
-// }
-// curretElement = new Element();
-// curretElement.type = '=';
-// curretElement.append(expChars[i]);
-// this.elements.add(curretElement);
-// curretElement = null;
-// break;
-//
-// case ' ' :
-// if(curretElement != null){
-// if(curretElement.type == '\''){
-// curretElement.append(expChars[i]);
-// }else{
-// this.elements.add(curretElement);
-// curretElement = null;
-// }
-// }
-//
-// break;
-//
-// case '\'' :
-// if(curretElement == null){
-// curretElement = new Element();
-// curretElement.type = '\'';
-//
-// }else if(curretElement.type == '\''){
-// this.elements.add(curretElement);
-// curretElement = null;
-//
-// }else{
-// this.elements.add(curretElement);
-// curretElement = new Element();
-// curretElement.type = '\'';
-//
-// }
-// break;
-//
-// case '[':
-// if(curretElement != null){
-// if(curretElement.type == '\''){
-// curretElement.append(expChars[i]);
-// continue;
-// }else{
-// this.elements.add(curretElement);
-// }
-// }
-// curretElement = new Element();
-// curretElement.type = '[';
-// curretElement.append(expChars[i]);
-// this.elements.add(curretElement);
-// curretElement = null;
-// break;
-//
-// case ']':
-// if(curretElement != null){
-// if(curretElement.type == '\''){
-// curretElement.append(expChars[i]);
-// continue;
-// }else{
-// this.elements.add(curretElement);
-// }
-// }
-// curretElement = new Element();
-// curretElement.type = ']';
-// curretElement.append(expChars[i]);
-// this.elements.add(curretElement);
-// curretElement = null;
-//
-// break;
-//
-// case '{':
-// if(curretElement != null){
-// if(curretElement.type == '\''){
-// curretElement.append(expChars[i]);
-// continue;
-// }else{
-// this.elements.add(curretElement);
-// }
-// }
-// curretElement = new Element();
-// curretElement.type = '{';
-// curretElement.append(expChars[i]);
-// this.elements.add(curretElement);
-// curretElement = null;
-// break;
-//
-// case '}':
-// if(curretElement != null){
-// if(curretElement.type == '\''){
-// curretElement.append(expChars[i]);
-// continue;
-// }else{
-// this.elements.add(curretElement);
-// }
-// }
-// curretElement = new Element();
-// curretElement.type = '}';
-// curretElement.append(expChars[i]);
-// this.elements.add(curretElement);
-// curretElement = null;
-//
-// break;
-// case ',':
-// if(curretElement != null){
-// if(curretElement.type == '\''){
-// curretElement.append(expChars[i]);
-// continue;
-// }else{
-// this.elements.add(curretElement);
-// }
-// }
-// curretElement = new Element();
-// curretElement.type = ',';
-// curretElement.append(expChars[i]);
-// this.elements.add(curretElement);
-// curretElement = null;
-//
-// break;
-//
-// default :
-// if(curretElement == null){
-// curretElement = new Element();
-// curretElement.type = 'F';
-// curretElement.append(expChars[i]);
-//
-// }else if(curretElement.type == 'F'){
-// curretElement.append(expChars[i]);
-//
-// }else if(curretElement.type == '\''){
-// curretElement.append(expChars[i]);
-//
-// }else{
-// this.elements.add(curretElement);
-// curretElement = new Element();
-// curretElement.type = 'F';
-// curretElement.append(expChars[i]);
-// }
-// }
-// }
-//
-// if(curretElement != null){
-// this.elements.add(curretElement);
-// curretElement = null;
-// }
-// }
-//
-// /**
-// * 语法解析
-// *
-// */
-// private void parseSyntax(boolean quickMode){
-// for(int i = 0 ; i < this.elements.size() ; i++){
-// Element e = this.elements.get(i);
-// if('F' == e.type){
-// Element e2 = this.elements.get(i + 1);
-// if('=' != e2.type && ':' != e2.type){
-// throw new IllegalStateException("表达式异常: = 或 : 号丢失");
-// }
-// Element e3 = this.elements.get(i + 2);
-// //处理 = 和 : 运算
-// if('\'' == e3.type){
-// i+=2;
-// if('=' == e2.type){
-// TermQuery tQuery = new TermQuery(new Term(e.toString() , e3.toString()));
-// this.querys.push(tQuery);
-// }else if(':' == e2.type){
-// String keyword = e3.toString();
-// //SWMCQuery Here
-// Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword , quickMode);
-// this.querys.push(_SWMCQuery);
-// }
-//
-// }else if('[' == e3.type || '{' == e3.type){
-// i+=2;
-// //处理 [] 和 {}
-// LinkedList eQueue = new LinkedList();
-// eQueue.add(e3);
-// for( i++ ; i < this.elements.size() ; i++){
-// Element eN = this.elements.get(i);
-// eQueue.add(eN);
-// if(']' == eN.type || '}' == eN.type){
-// break;
-// }
-// }
-// //翻译RangeQuery
-// Query rangeQuery = this.toTermRangeQuery(e , eQueue);
-// this.querys.push(rangeQuery);
-// }else{
-// throw new IllegalStateException("表达式异常:匹配值丢失");
-// }
-//
-// }else if('(' == e.type){
-// this.operates.push(e);
-//
-// }else if(')' == e.type){
-// boolean doPop = true;
-// while(doPop && !this.operates.empty()){
-// Element op = this.operates.pop();
-// if('(' == op.type){
-// doPop = false;
-// }else {
-// Query q = toBooleanQuery(op);
-// this.querys.push(q);
-// }
-//
-// }
-// }else{
-//
-// if(this.operates.isEmpty()){
-// this.operates.push(e);
-// }else{
-// boolean doPeek = true;
-// while(doPeek && !this.operates.isEmpty()){
-// Element eleOnTop = this.operates.peek();
-// if('(' == eleOnTop.type){
-// doPeek = false;
-// this.operates.push(e);
-// }else if(compare(e , eleOnTop) == 1){
-// this.operates.push(e);
-// doPeek = false;
-// }else if(compare(e , eleOnTop) == 0){
-// Query q = toBooleanQuery(eleOnTop);
-// this.operates.pop();
-// this.querys.push(q);
-// }else{
-// Query q = toBooleanQuery(eleOnTop);
-// this.operates.pop();
-// this.querys.push(q);
-// }
-// }
-//
-// if(doPeek && this.operates.empty()){
-// this.operates.push(e);
-// }
-// }
-// }
-// }
-//
-// while(!this.operates.isEmpty()){
-// Element eleOnTop = this.operates.pop();
-// Query q = toBooleanQuery(eleOnTop);
-// this.querys.push(q);
-// }
-// }
-//
-// /**
-// * 根据逻辑操作符,生成BooleanQuery
-// * @param op
-// * @return
-// */
-// private Query toBooleanQuery(Element op){
-// if(this.querys.size() == 0){
-// return null;
-// }
-//
-// BooleanQuery resultQuery = new BooleanQuery();
-//
-// if(this.querys.size() == 1){
-// return this.querys.get(0);
-// }
-//
-// Query q2 = this.querys.pop();
-// Query q1 = this.querys.pop();
-// if('&' == op.type){
-// if(q1 != null){
-// if(q1 instanceof BooleanQuery){
-// BooleanClause[] clauses = ((BooleanQuery)q1).getClauses();
-// if(clauses.length > 0
-// && clauses[0].getOccur() == Occur.MUST){
-// for(BooleanClause c : clauses){
-// resultQuery.add(c);
-// }
-// }else{
-// resultQuery.add(q1,Occur.MUST);
-// }
-//
-// }else{
-// //q1 instanceof TermQuery
-// //q1 instanceof TermRangeQuery
-// //q1 instanceof PhraseQuery
-// //others
-// resultQuery.add(q1,Occur.MUST);
-// }
-// }
-//
-// if(q2 != null){
-// if(q2 instanceof BooleanQuery){
-// BooleanClause[] clauses = ((BooleanQuery)q2).getClauses();
-// if(clauses.length > 0
-// && clauses[0].getOccur() == Occur.MUST){
-// for(BooleanClause c : clauses){
-// resultQuery.add(c);
-// }
-// }else{
-// resultQuery.add(q2,Occur.MUST);
-// }
-//
-// }else{
-// //q1 instanceof TermQuery
-// //q1 instanceof TermRangeQuery
-// //q1 instanceof PhraseQuery
-// //others
-// resultQuery.add(q2,Occur.MUST);
-// }
-// }
-//
-// }else if('|' == op.type){
-// if(q1 != null){
-// if(q1 instanceof BooleanQuery){
-// BooleanClause[] clauses = ((BooleanQuery)q1).getClauses();
-// if(clauses.length > 0
-// && clauses[0].getOccur() == Occur.SHOULD){
-// for(BooleanClause c : clauses){
-// resultQuery.add(c);
-// }
-// }else{
-// resultQuery.add(q1,Occur.SHOULD);
-// }
-//
-// }else{
-// //q1 instanceof TermQuery
-// //q1 instanceof TermRangeQuery
-// //q1 instanceof PhraseQuery
-// //others
-// resultQuery.add(q1,Occur.SHOULD);
-// }
-// }
-//
-// if(q2 != null){
-// if(q2 instanceof BooleanQuery){
-// BooleanClause[] clauses = ((BooleanQuery)q2).getClauses();
-// if(clauses.length > 0
-// && clauses[0].getOccur() == Occur.SHOULD){
-// for(BooleanClause c : clauses){
-// resultQuery.add(c);
-// }
-// }else{
-// resultQuery.add(q2,Occur.SHOULD);
-// }
-// }else{
-// //q2 instanceof TermQuery
-// //q2 instanceof TermRangeQuery
-// //q2 instanceof PhraseQuery
-// //others
-// resultQuery.add(q2,Occur.SHOULD);
-//
-// }
-// }
-//
-// }else if('-' == op.type){
-// if(q1 == null || q2 == null){
-// throw new IllegalStateException("表达式异常:SubQuery 个数不匹配");
-// }
-//
-// if(q1 instanceof BooleanQuery){
-// BooleanClause[] clauses = ((BooleanQuery)q1).getClauses();
-// if(clauses.length > 0){
-// for(BooleanClause c : clauses){
-// resultQuery.add(c);
-// }
-// }else{
-// resultQuery.add(q1,Occur.MUST);
-// }
-//
-// }else{
-// //q1 instanceof TermQuery
-// //q1 instanceof TermRangeQuery
-// //q1 instanceof PhraseQuery
-// //others
-// resultQuery.add(q1,Occur.MUST);
-// }
-//
-// resultQuery.add(q2,Occur.MUST_NOT);
-// }
-// return resultQuery;
-// }
-//
-// /**
-// * 组装TermRangeQuery
-// * @param elements
-// * @return
-// */
-// private TermRangeQuery toTermRangeQuery(Element fieldNameEle , LinkedList elements){
-//
-// boolean includeFirst = false;
-// boolean includeLast = false;
-// String firstValue = null;
-// String lastValue = null;
-// //检查第一个元素是否是[或者{
-// Element first = elements.getFirst();
-// if('[' == first.type){
-// includeFirst = true;
-// }else if('{' == first.type){
-// includeFirst = false;
-// }else {
-// throw new IllegalStateException("表达式异常");
-// }
-// //检查最后一个元素是否是]或者}
-// Element last = elements.getLast();
-// if(']' == last.type){
-// includeLast = true;
-// }else if('}' == last.type){
-// includeLast = false;
-// }else {
-// throw new IllegalStateException("表达式异常, RangeQuery缺少结束括号");
-// }
-// if(elements.size() < 4 || elements.size() > 5){
-// throw new IllegalStateException("表达式异常, RangeQuery 错误");
-// }
-// //读出中间部分
-// Element e2 = elements.get(1);
-// if('\'' == e2.type){
-// firstValue = e2.toString();
-// //
-// Element e3 = elements.get(2);
-// if(',' != e3.type){
-// throw new IllegalStateException("表达式异常, RangeQuery缺少逗号分隔");
-// }
-// //
-// Element e4 = elements.get(3);
-// if('\'' == e4.type){
-// lastValue = e4.toString();
-// }else if(e4 != last){
-// throw new IllegalStateException("表达式异常,RangeQuery格式错误");
-// }
-// }else if(',' == e2.type){
-// firstValue = null;
-// //
-// Element e3 = elements.get(2);
-// if('\'' == e3.type){
-// lastValue = e3.toString();
-// }else{
-// throw new IllegalStateException("表达式异常,RangeQuery格式错误");
-// }
-//
-// }else {
-// throw new IllegalStateException("表达式异常, RangeQuery格式错误");
-// }
-//
-// return new TermRangeQuery(fieldNameEle.toString() , new BytesRef(firstValue) , new BytesRef(lastValue) , includeFirst , includeLast);
-// }
-//
-// /**
-// * 比较操作符优先级
-// * @param e1
-// * @param e2
-// * @return
-// */
-// private int compare(Element e1 , Element e2){
-// if('&' == e1.type){
-// if('&' == e2.type){
-// return 0;
-// }else {
-// return 1;
-// }
-// }else if('|' == e1.type){
-// if('&' == e2.type){
-// return -1;
-// }else if('|' == e2.type){
-// return 0;
-// }else{
-// return 1;
-// }
-// }else{
-// if('-' == e2.type){
-// return 0;
-// }else{
-// return -1;
-// }
-// }
-// }
-//
-// /**
-// * 表达式元素(操作符、FieldName、FieldValue)
-// * @author linliangyi
-// * May 20, 2010
-// */
-// private class Element{
-// char type = 0;
-// StringBuffer eleTextBuff;
-//
-// public Element(){
-// eleTextBuff = new StringBuffer();
-// }
-//
-// public void append(char c){
-// this.eleTextBuff.append(c);
-// }
-//
-// public String toString(){
-// return this.eleTextBuff.toString();
-// }
-// }
-//
-// public static void main(String[] args){
-// IKQueryExpressionParser parser = new IKQueryExpressionParser();
-// //String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'";
-// String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'";
-// Query result = parser.parseExp(ikQueryExp , true);
-// System.out.println(result);
-//
-// }
-//
-//}
+/**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ */
+package org.wltea.analyzer.query;
+
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Stack;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * IK simple query expression parser,
+ * combined with the SWMC query algorithm.
+ *
+ * Example expression:
+ * (id='1231231' && title:'monkey') || (content:'你好吗' || ulr='www.ik.com') - name:'helloword'
+ * @author linliangyi
+ *
+ */
+public class IKQueryExpressionParser {
+
+ //public static final String LUCENE_SPECIAL_CHAR = "&&||-()':={}[],";
+
+	private List<Element> elements = new ArrayList<Element>();
+
+	private Stack<Query> querys = new Stack<Query>();
+
+	private Stack<Element> operates = new Stack<Element>();
+
+ /**
+	 * Parse a query expression and build a Lucene Query object
+ *
+ * @param expression
+ * @param quickMode
+ * @return Lucene query
+ */
+ public Query parseExp(String expression , boolean quickMode){
+ Query lucenceQuery = null;
+ if(expression != null && !"".equals(expression.trim())){
+ try{
+				//lexical analysis: split the expression into elements
+ this.splitElements(expression);
+				//syntactic analysis: build the Query tree
+ this.parseSyntax(quickMode);
+ if(this.querys.size() == 1){
+ lucenceQuery = this.querys.pop();
+ }else{
+ throw new IllegalStateException("表达式异常: 缺少逻辑操作符 或 括号缺失");
+ }
+ }finally{
+ elements.clear();
+ querys.clear();
+ operates.clear();
+ }
+ }
+ return lucenceQuery;
+ }
+
+ /**
+	 * Lexical analysis of the expression
+ * @param expression
+ */
+ private void splitElements(String expression){
+
+ if(expression == null){
+ return;
+ }
+ Element curretElement = null;
+
+ char[] expChars = expression.toCharArray();
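+		//single-quote state: while inside '…', every special character is appended as literal text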
+ for(int i = 0 ; i < expChars.length ; i++){
+ switch(expChars[i]){
+ case '&' :
+ if(curretElement == null){
+ curretElement = new Element();
+ curretElement.type = '&';
+ curretElement.append(expChars[i]);
+ }else if(curretElement.type == '&'){
+ curretElement.append(expChars[i]);
+ this.elements.add(curretElement);
+ curretElement = null;
+ }else if(curretElement.type == '\''){
+ curretElement.append(expChars[i]);
+ }else {
+ this.elements.add(curretElement);
+ curretElement = new Element();
+ curretElement.type = '&';
+ curretElement.append(expChars[i]);
+ }
+ break;
+
+ case '|' :
+ if(curretElement == null){
+ curretElement = new Element();
+ curretElement.type = '|';
+ curretElement.append(expChars[i]);
+ }else if(curretElement.type == '|'){
+ curretElement.append(expChars[i]);
+ this.elements.add(curretElement);
+ curretElement = null;
+ }else if(curretElement.type == '\''){
+ curretElement.append(expChars[i]);
+ }else {
+ this.elements.add(curretElement);
+ curretElement = new Element();
+ curretElement.type = '|';
+ curretElement.append(expChars[i]);
+ }
+ break;
+
+ case '-' :
+ if(curretElement != null){
+ if(curretElement.type == '\''){
+ curretElement.append(expChars[i]);
+ continue;
+ }else{
+ this.elements.add(curretElement);
+ }
+ }
+ curretElement = new Element();
+ curretElement.type = '-';
+ curretElement.append(expChars[i]);
+ this.elements.add(curretElement);
+ curretElement = null;
+ break;
+
+ case '(' :
+ if(curretElement != null){
+ if(curretElement.type == '\''){
+ curretElement.append(expChars[i]);
+ continue;
+ }else{
+ this.elements.add(curretElement);
+ }
+ }
+ curretElement = new Element();
+ curretElement.type = '(';
+ curretElement.append(expChars[i]);
+ this.elements.add(curretElement);
+ curretElement = null;
+ break;
+
+ case ')' :
+ if(curretElement != null){
+ if(curretElement.type == '\''){
+ curretElement.append(expChars[i]);
+ continue;
+ }else{
+ this.elements.add(curretElement);
+ }
+ }
+ curretElement = new Element();
+ curretElement.type = ')';
+ curretElement.append(expChars[i]);
+ this.elements.add(curretElement);
+ curretElement = null;
+ break;
+
+ case ':' :
+ if(curretElement != null){
+ if(curretElement.type == '\''){
+ curretElement.append(expChars[i]);
+ continue;
+ }else{
+ this.elements.add(curretElement);
+ }
+ }
+ curretElement = new Element();
+ curretElement.type = ':';
+ curretElement.append(expChars[i]);
+ this.elements.add(curretElement);
+ curretElement = null;
+ break;
+
+ case '=' :
+ if(curretElement != null){
+ if(curretElement.type == '\''){
+ curretElement.append(expChars[i]);
+ continue;
+ }else{
+ this.elements.add(curretElement);
+ }
+ }
+ curretElement = new Element();
+ curretElement.type = '=';
+ curretElement.append(expChars[i]);
+ this.elements.add(curretElement);
+ curretElement = null;
+ break;
+
+ case ' ' :
+ if(curretElement != null){
+ if(curretElement.type == '\''){
+ curretElement.append(expChars[i]);
+ }else{
+ this.elements.add(curretElement);
+ curretElement = null;
+ }
+ }
+
+ break;
+
+ case '\'' :
+ if(curretElement == null){
+ curretElement = new Element();
+ curretElement.type = '\'';
+
+ }else if(curretElement.type == '\''){
+ this.elements.add(curretElement);
+ curretElement = null;
+
+ }else{
+ this.elements.add(curretElement);
+ curretElement = new Element();
+ curretElement.type = '\'';
+
+ }
+ break;
+
+ case '[':
+ if(curretElement != null){
+ if(curretElement.type == '\''){
+ curretElement.append(expChars[i]);
+ continue;
+ }else{
+ this.elements.add(curretElement);
+ }
+ }
+ curretElement = new Element();
+ curretElement.type = '[';
+ curretElement.append(expChars[i]);
+ this.elements.add(curretElement);
+ curretElement = null;
+ break;
+
+ case ']':
+ if(curretElement != null){
+ if(curretElement.type == '\''){
+ curretElement.append(expChars[i]);
+ continue;
+ }else{
+ this.elements.add(curretElement);
+ }
+ }
+ curretElement = new Element();
+ curretElement.type = ']';
+ curretElement.append(expChars[i]);
+ this.elements.add(curretElement);
+ curretElement = null;
+
+ break;
+
+ case '{':
+ if(curretElement != null){
+ if(curretElement.type == '\''){
+ curretElement.append(expChars[i]);
+ continue;
+ }else{
+ this.elements.add(curretElement);
+ }
+ }
+ curretElement = new Element();
+ curretElement.type = '{';
+ curretElement.append(expChars[i]);
+ this.elements.add(curretElement);
+ curretElement = null;
+ break;
+
+ case '}':
+ if(curretElement != null){
+ if(curretElement.type == '\''){
+ curretElement.append(expChars[i]);
+ continue;
+ }else{
+ this.elements.add(curretElement);
+ }
+ }
+ curretElement = new Element();
+ curretElement.type = '}';
+ curretElement.append(expChars[i]);
+ this.elements.add(curretElement);
+ curretElement = null;
+
+ break;
+ case ',':
+ if(curretElement != null){
+ if(curretElement.type == '\''){
+ curretElement.append(expChars[i]);
+ continue;
+ }else{
+ this.elements.add(curretElement);
+ }
+ }
+ curretElement = new Element();
+ curretElement.type = ',';
+ curretElement.append(expChars[i]);
+ this.elements.add(curretElement);
+ curretElement = null;
+
+ break;
+
+ default :
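+					//'F': accumulate the characters of a field name or unquoted value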
+ if(curretElement == null){
+ curretElement = new Element();
+ curretElement.type = 'F';
+ curretElement.append(expChars[i]);
+
+ }else if(curretElement.type == 'F'){
+ curretElement.append(expChars[i]);
+
+ }else if(curretElement.type == '\''){
+ curretElement.append(expChars[i]);
+
+ }else{
+ this.elements.add(curretElement);
+ curretElement = new Element();
+ curretElement.type = 'F';
+ curretElement.append(expChars[i]);
+ }
+ }
+ }
+
+ if(curretElement != null){
+ this.elements.add(curretElement);
+ curretElement = null;
+ }
+ }
+
+ /**
+	 * Syntactic analysis
+ *
+ */
+ private void parseSyntax(boolean quickMode){
+ for(int i = 0 ; i < this.elements.size() ; i++){
+ Element e = this.elements.get(i);
+ if('F' == e.type){
+ Element e2 = this.elements.get(i + 1);
+ if('=' != e2.type && ':' != e2.type){
+ throw new IllegalStateException("表达式异常: = 或 : 号丢失");
+ }
+ Element e3 = this.elements.get(i + 2);
+				//handle the = and : operators
+ if('\'' == e3.type){
+ i+=2;
+ if('=' == e2.type){
+ TermQuery tQuery = new TermQuery(new Term(e.toString() , e3.toString()));
+ this.querys.push(tQuery);
+ }else if(':' == e2.type){
+ String keyword = e3.toString();
+ //SWMCQuery Here
+ Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword , quickMode);
+ this.querys.push(_SWMCQuery);
+ }
+
+ }else if('[' == e3.type || '{' == e3.type){
+ i+=2;
+					//handle [] and {}
+					LinkedList<Element> eQueue = new LinkedList<Element>();
+ eQueue.add(e3);
+ for( i++ ; i < this.elements.size() ; i++){
+ Element eN = this.elements.get(i);
+ eQueue.add(eN);
+ if(']' == eN.type || '}' == eN.type){
+ break;
+ }
+ }
+					//translate into a RangeQuery
+ Query rangeQuery = this.toTermRangeQuery(e , eQueue);
+ this.querys.push(rangeQuery);
+ }else{
+ throw new IllegalStateException("表达式异常:匹配值丢失");
+ }
+
+ }else if('(' == e.type){
+ this.operates.push(e);
+
+ }else if(')' == e.type){
+ boolean doPop = true;
+ while(doPop && !this.operates.empty()){
+ Element op = this.operates.pop();
+ if('(' == op.type){
+ doPop = false;
+ }else {
+ Query q = toBooleanQuery(op);
+ this.querys.push(q);
+ }
+
+ }
+ }else{
+
+ if(this.operates.isEmpty()){
+ this.operates.push(e);
+ }else{
+ boolean doPeek = true;
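+					//shunting-yard style: reduce stacked operators of equal or higher precedence before pushing e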
+ while(doPeek && !this.operates.isEmpty()){
+ Element eleOnTop = this.operates.peek();
+ if('(' == eleOnTop.type){
+ doPeek = false;
+ this.operates.push(e);
+ }else if(compare(e , eleOnTop) == 1){
+ this.operates.push(e);
+ doPeek = false;
+ }else if(compare(e , eleOnTop) == 0){
+ Query q = toBooleanQuery(eleOnTop);
+ this.operates.pop();
+ this.querys.push(q);
+ }else{
+ Query q = toBooleanQuery(eleOnTop);
+ this.operates.pop();
+ this.querys.push(q);
+ }
+ }
+
+ if(doPeek && this.operates.empty()){
+ this.operates.push(e);
+ }
+ }
+ }
+ }
+
+ while(!this.operates.isEmpty()){
+ Element eleOnTop = this.operates.pop();
+ Query q = toBooleanQuery(eleOnTop);
+ this.querys.push(q);
+ }
+ }
+
+ /**
+	 * Build a BooleanQuery from a logical operator
+ * @param op
+ * @return
+ */
+ private Query toBooleanQuery(Element op){
+ if(this.querys.size() == 0){
+ return null;
+ }
+
+ BooleanQuery resultQuery = new BooleanQuery();
+
+ if(this.querys.size() == 1){
+ return this.querys.get(0);
+ }
+
+ Query q2 = this.querys.pop();
+ Query q1 = this.querys.pop();
+ if('&' == op.type){
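+			//AND: add both sub-queries as MUST clauses, flattening BooleanQuerys whose leading clause is already MUST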
+ if(q1 != null){
+ if(q1 instanceof BooleanQuery){
+ BooleanClause[] clauses = ((BooleanQuery)q1).getClauses();
+ if(clauses.length > 0
+ && clauses[0].getOccur() == Occur.MUST){
+ for(BooleanClause c : clauses){
+ resultQuery.add(c);
+ }
+ }else{
+ resultQuery.add(q1,Occur.MUST);
+ }
+
+ }else{
+ //q1 instanceof TermQuery
+ //q1 instanceof TermRangeQuery
+ //q1 instanceof PhraseQuery
+ //others
+ resultQuery.add(q1,Occur.MUST);
+ }
+ }
+
+ if(q2 != null){
+ if(q2 instanceof BooleanQuery){
+ BooleanClause[] clauses = ((BooleanQuery)q2).getClauses();
+ if(clauses.length > 0
+ && clauses[0].getOccur() == Occur.MUST){
+ for(BooleanClause c : clauses){
+ resultQuery.add(c);
+ }
+ }else{
+ resultQuery.add(q2,Occur.MUST);
+ }
+
+ }else{
+ //q1 instanceof TermQuery
+ //q1 instanceof TermRangeQuery
+ //q1 instanceof PhraseQuery
+ //others
+ resultQuery.add(q2,Occur.MUST);
+ }
+ }
+
+ }else if('|' == op.type){
+ if(q1 != null){
+ if(q1 instanceof BooleanQuery){
+ BooleanClause[] clauses = ((BooleanQuery)q1).getClauses();
+ if(clauses.length > 0
+ && clauses[0].getOccur() == Occur.SHOULD){
+ for(BooleanClause c : clauses){
+ resultQuery.add(c);
+ }
+ }else{
+ resultQuery.add(q1,Occur.SHOULD);
+ }
+
+ }else{
+ //q1 instanceof TermQuery
+ //q1 instanceof TermRangeQuery
+ //q1 instanceof PhraseQuery
+ //others
+ resultQuery.add(q1,Occur.SHOULD);
+ }
+ }
+
+ if(q2 != null){
+ if(q2 instanceof BooleanQuery){
+ BooleanClause[] clauses = ((BooleanQuery)q2).getClauses();
+ if(clauses.length > 0
+ && clauses[0].getOccur() == Occur.SHOULD){
+ for(BooleanClause c : clauses){
+ resultQuery.add(c);
+ }
+ }else{
+ resultQuery.add(q2,Occur.SHOULD);
+ }
+ }else{
+ //q2 instanceof TermQuery
+ //q2 instanceof TermRangeQuery
+ //q2 instanceof PhraseQuery
+ //others
+ resultQuery.add(q2,Occur.SHOULD);
+
+ }
+ }
+
+ }else if('-' == op.type){
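+			//NOT: q1 becomes a MUST clause, q2 a MUST_NOT clause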
+ if(q1 == null || q2 == null){
+ throw new IllegalStateException("表达式异常:SubQuery 个数不匹配");
+ }
+
+ if(q1 instanceof BooleanQuery){
+ BooleanClause[] clauses = ((BooleanQuery)q1).getClauses();
+ if(clauses.length > 0){
+ for(BooleanClause c : clauses){
+ resultQuery.add(c);
+ }
+ }else{
+ resultQuery.add(q1,Occur.MUST);
+ }
+
+ }else{
+ //q1 instanceof TermQuery
+ //q1 instanceof TermRangeQuery
+ //q1 instanceof PhraseQuery
+ //others
+ resultQuery.add(q1,Occur.MUST);
+ }
+
+ resultQuery.add(q2,Occur.MUST_NOT);
+ }
+ return resultQuery;
+ }
+
+ /**
+	 * Assemble a TermRangeQuery
+ * @param elements
+ * @return
+ */
+	private TermRangeQuery toTermRangeQuery(Element fieldNameEle , LinkedList<Element> elements){
+
+ boolean includeFirst = false;
+ boolean includeLast = false;
+ String firstValue = null;
+ String lastValue = null;
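+		//expected layout: '[' or '{', optional first value, ',', optional last value, ']' or '}'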
+		//check that the first element is [ or {
+ Element first = elements.getFirst();
+ if('[' == first.type){
+ includeFirst = true;
+ }else if('{' == first.type){
+ includeFirst = false;
+ }else {
+ throw new IllegalStateException("表达式异常");
+ }
+		//check that the last element is ] or }
+ Element last = elements.getLast();
+ if(']' == last.type){
+ includeLast = true;
+ }else if('}' == last.type){
+ includeLast = false;
+ }else {
+ throw new IllegalStateException("表达式异常, RangeQuery缺少结束括号");
+ }
+ if(elements.size() < 4 || elements.size() > 5){
+ throw new IllegalStateException("表达式异常, RangeQuery 错误");
+ }
+		//read the middle part
+ Element e2 = elements.get(1);
+ if('\'' == e2.type){
+ firstValue = e2.toString();
+ //
+ Element e3 = elements.get(2);
+ if(',' != e3.type){
+ throw new IllegalStateException("表达式异常, RangeQuery缺少逗号分隔");
+ }
+ //
+ Element e4 = elements.get(3);
+ if('\'' == e4.type){
+ lastValue = e4.toString();
+ }else if(e4 != last){
+ throw new IllegalStateException("表达式异常,RangeQuery格式错误");
+ }
+ }else if(',' == e2.type){
+ firstValue = null;
+ //
+ Element e3 = elements.get(2);
+ if('\'' == e3.type){
+ lastValue = e3.toString();
+ }else{
+ throw new IllegalStateException("表达式异常,RangeQuery格式错误");
+ }
+
+ }else {
+ throw new IllegalStateException("表达式异常, RangeQuery格式错误");
+ }
+
+		//null bounds mean an open-ended range; new BytesRef(null) would throw an NPE
+		return new TermRangeQuery(fieldNameEle.toString() , firstValue == null ? null : new BytesRef(firstValue) , lastValue == null ? null : new BytesRef(lastValue) , includeFirst , includeLast);
+ }
+
+ /**
+	 * Compare operator precedence
+ * @param e1
+ * @param e2
+ * @return
+ */
+ private int compare(Element e1 , Element e2){
+ if('&' == e1.type){
+ if('&' == e2.type){
+ return 0;
+ }else {
+ return 1;
+ }
+ }else if('|' == e1.type){
+ if('&' == e2.type){
+ return -1;
+ }else if('|' == e2.type){
+ return 0;
+ }else{
+ return 1;
+ }
+ }else{
+ if('-' == e2.type){
+ return 0;
+ }else{
+ return -1;
+ }
+ }
+ }
+
+ /**
+	 * Expression element (operator, FieldName, FieldValue)
+ * @author linliangyi
+ * May 20, 2010
+ */
+ private class Element{
+ char type = 0;
+ StringBuffer eleTextBuff;
+
+ public Element(){
+ eleTextBuff = new StringBuffer();
+ }
+
+ public void append(char c){
+ this.eleTextBuff.append(c);
+ }
+
+ public String toString(){
+ return this.eleTextBuff.toString();
+ }
+ }
+
+ public static void main(String[] args){
+ IKQueryExpressionParser parser = new IKQueryExpressionParser();
+ //String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'";
+ String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'";
+ Query result = parser.parseExp(ikQueryExp , true);
+ System.out.println(result);
+
+ }
+
+}
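
A note on the parser restored above: parseSyntax() is a small shunting-yard variant over the element list, with compare() ordering the operators '&' > '|' > '-'. A minimal usage sketch follows (hypothetical, not part of the patch; it assumes the sources in this diff plus Lucene 4.x are on the classpath):

    import org.apache.lucene.search.Query;
    import org.wltea.analyzer.query.IKQueryExpressionParser;

    public class ParserSketch {
        public static void main(String[] args) {
            IKQueryExpressionParser parser = new IKQueryExpressionParser();
            // '=' builds a TermQuery, ':' builds an SWMC query;
            // ['a','b'] is an inclusive range, {'a','b'} an exclusive one
            Query q = parser.parseExp("(id='42' && title:'monkey') - name:'helloword'", true);
            System.out.println(q);
        }
    }
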
diff --git a/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java b/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java
index 1c3bd42..e8c00d1 100644
--- a/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java
+++ b/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java
@@ -1,153 +1,153 @@
-///**
-// * IK 中文分词 版本 5.0
-// * IK Analyzer release 5.0
-// *
-// * Licensed to the Apache Software Foundation (ASF) under one or more
-// * contributor license agreements. See the NOTICE file distributed with
-// * this work for additional information regarding copyright ownership.
-// * The ASF licenses this file to You under the Apache License, Version 2.0
-// * (the "License"); you may not use this file except in compliance with
-// * the License. You may obtain a copy of the License at
-// *
-// * http://www.apache.org/licenses/LICENSE-2.0
-// *
-// * Unless required by applicable law or agreed to in writing, software
-// * distributed under the License is distributed on an "AS IS" BASIS,
-// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// * See the License for the specific language governing permissions and
-// * limitations under the License.
-// *
-// * 源代码由林良益(linliangyi2005@gmail.com)提供
-// * 版权声明 2012,乌龙茶工作室
-// * provided by Linliangyi and copyright 2012 by Oolong studio
-// *
-// */
-//package org.wltea.analyzer.query;
-//
-//import java.io.IOException;
-//import java.io.StringReader;
-//import java.util.ArrayList;
-//import java.util.List;
-//
-//import org.apache.lucene.analysis.standard.StandardAnalyzer;
-//import org.apache.lucene.queryparser.classic.ParseException;
-//import org.apache.lucene.queryparser.classic.QueryParser;
-//import org.apache.lucene.search.Query;
-//import org.apache.lucene.util.Version;
-//import org.wltea.analyzer.core.IKSegmenter;
-//import org.wltea.analyzer.core.Lexeme;
-//
-///**
-// * Single Word Multi Char Query Builder
-// * IK分词算法专用
-// * @author linliangyi
-// *
-// */
-//public class SWMCQueryBuilder {
-//
-// /**
-// * 生成SWMCQuery
-// * @param fieldName
-// * @param keywords
-// * @param quickMode
-// * @return Lucene Query
-// */
-// public static Query create(String fieldName ,String keywords , boolean quickMode){
-// if(fieldName == null || keywords == null){
-// throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
-// }
-// //1.对keywords进行分词处理
-// List lexemes = doAnalyze(keywords);
-// //2.根据分词结果,生成SWMCQuery
-// Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
-// return _SWMCQuery;
-// }
-//
-// /**
-// * 分词切分,并返回结链表
-// * @param keywords
-// * @return
-// */
-// private static List doAnalyze(String keywords){
-// List lexemes = new ArrayList();
-// IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
-// try{
-// Lexeme l = null;
-// while( (l = ikSeg.next()) != null){
-// lexemes.add(l);
-// }
-// }catch(IOException e){
-// e.printStackTrace();
-// }
-// return lexemes;
-// }
-//
-//
-// /**
-// * 根据分词结果生成SWMC搜索
-// * @param fieldName
+/**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ */
+package org.wltea.analyzer.query;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.Version;
+import org.wltea.analyzer.core.IKSegmenter;
+import org.wltea.analyzer.core.Lexeme;
+
+/**
+ * Single Word Multi Char Query Builder
+ * Dedicated to the IK segmentation algorithm
+ * @author linliangyi
+ *
+ */
+public class SWMCQueryBuilder {
+
+ /**
+	 * Build an SWMCQuery
+ * @param fieldName
+ * @param keywords
+ * @param quickMode
+ * @return Lucene Query
+ */
+ public static Query create(String fieldName ,String keywords , boolean quickMode){
+ if(fieldName == null || keywords == null){
+ throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
+ }
+		//1. segment the keywords
+		List<Lexeme> lexemes = doAnalyze(keywords);
+		//2. build the SWMCQuery from the segmentation result
+ Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
+ return _SWMCQuery;
+ }
+
+ /**
+	 * Segment the input and return the lexeme list
+ * @param keywords
+ * @return
+ */
+	private static List<Lexeme> doAnalyze(String keywords){
+		List<Lexeme> lexemes = new ArrayList<Lexeme>();
+ IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
+ try{
+ Lexeme l = null;
+ while( (l = ikSeg.next()) != null){
+ lexemes.add(l);
+ }
+ }catch(IOException e){
+ e.printStackTrace();
+ }
+ return lexemes;
+ }
+
+
+ /**
+	 * Build the SWMC query from the segmentation result
+ * @param fieldName
// * @param pathOption
-// * @param quickMode
-// * @return
-// */
-// private static Query getSWMCQuery(String fieldName , List lexemes , boolean quickMode){
-// //构造SWMC的查询表达式
-// StringBuffer keywordBuffer = new StringBuffer();
-// //精简的SWMC的查询表达式
-// StringBuffer keywordBuffer_Short = new StringBuffer();
-// //记录最后词元长度
-// int lastLexemeLength = 0;
-// //记录最后词元结束位置
-// int lastLexemeEnd = -1;
-//
-// int shortCount = 0;
-// int totalCount = 0;
-// for(Lexeme l : lexemes){
-// totalCount += l.getLength();
-// //精简表达式
-// if(l.getLength() > 1){
-// keywordBuffer_Short.append(' ').append(l.getLexemeText());
-// shortCount += l.getLength();
-// }
-//
-// if(lastLexemeLength == 0){
-// keywordBuffer.append(l.getLexemeText());
-// }else if(lastLexemeLength == 1 && l.getLength() == 1
-// && lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并)
-// keywordBuffer.append(l.getLexemeText());
-// }else{
-// keywordBuffer.append(' ').append(l.getLexemeText());
-//
-// }
-// lastLexemeLength = l.getLength();
-// lastLexemeEnd = l.getEndPosition();
-// }
-//
-// //借助lucene queryparser 生成SWMC Query
-// QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40));
-// qp.setDefaultOperator(QueryParser.AND_OPERATOR);
-// qp.setAutoGeneratePhraseQueries(true);
-//
-// if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
-// try {
-// //System.out.println(keywordBuffer.toString());
-// Query q = qp.parse(keywordBuffer_Short.toString());
-// return q;
-// } catch (ParseException e) {
-// e.printStackTrace();
-// }
-//
-// }else{
-// if(keywordBuffer.length() > 0){
-// try {
-// //System.out.println(keywordBuffer.toString());
-// Query q = qp.parse(keywordBuffer.toString());
-// return q;
-// } catch (ParseException e) {
-// e.printStackTrace();
-// }
-// }
-// }
-// return null;
-// }
-//}
+ * @param quickMode
+ * @return
+ */
+	private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){
+		//full SWMC query expression
+		StringBuffer keywordBuffer = new StringBuffer();
+		//condensed SWMC query expression
+		StringBuffer keywordBuffer_Short = new StringBuffer();
+		//length of the previous lexeme
+		int lastLexemeLength = 0;
+		//end position of the previous lexeme
+		int lastLexemeEnd = -1;
+
+ int shortCount = 0;
+ int totalCount = 0;
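+		//totalCount: characters across all lexemes; shortCount: characters in multi-char lexemes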
+ for(Lexeme l : lexemes){
+ totalCount += l.getLength();
+			//condensed expression: multi-char lexemes only
+ if(l.getLength() > 1){
+ keywordBuffer_Short.append(' ').append(l.getLexemeText());
+ shortCount += l.getLength();
+ }
+
+ if(lastLexemeLength == 0){
+ keywordBuffer.append(l.getLexemeText());
+ }else if(lastLexemeLength == 1 && l.getLength() == 1
+					&& lastLexemeEnd == l.getBeginPosition()){//adjacent single-char lexemes: merge
+ keywordBuffer.append(l.getLexemeText());
+ }else{
+ keywordBuffer.append(' ').append(l.getLexemeText());
+
+ }
+ lastLexemeLength = l.getLength();
+ lastLexemeEnd = l.getEndPosition();
+ }
+
+		//use the Lucene QueryParser to build the SWMC Query
+ QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40));
+ qp.setDefaultOperator(QueryParser.AND_OPERATOR);
+ qp.setAutoGeneratePhraseQueries(true);
+
+ if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
+ try {
+ //System.out.println(keywordBuffer.toString());
+ Query q = qp.parse(keywordBuffer_Short.toString());
+ return q;
+ } catch (ParseException e) {
+ e.printStackTrace();
+ }
+
+ }else{
+ if(keywordBuffer.length() > 0){
+ try {
+ //System.out.println(keywordBuffer.toString());
+ Query q = qp.parse(keywordBuffer.toString());
+ return q;
+ } catch (ParseException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ return null;
+ }
+}
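
The builder above implements the SWMC ("Single Word Multi Char") heuristic: adjacent single-character lexemes are glued back together in keywordBuffer, and when multi-character lexemes cover more than half of the input (shortCount / totalCount > 0.5), quick mode parses only the condensed keywordBuffer_Short. A hedged sketch of direct use (hypothetical, not part of the patch; it assumes the IK dictionaries can be loaded from the default classpath configuration):

    import org.apache.lucene.search.Query;
    import org.wltea.analyzer.query.SWMCQueryBuilder;

    public class SWMCSketch {
        public static void main(String[] args) {
            // quickMode=true may drop single-character lexemes when multi-char
            // lexemes dominate; quickMode=false always parses the full expression
            Query q = SWMCQueryBuilder.create("content", "中文分词工具包", true);
            System.out.println(q);
        }
    }
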
diff --git a/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java b/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java
index e6a9e9f..32a998d 100644
--- a/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java
+++ b/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java
@@ -1,147 +1,147 @@
-///**
-// * IK 中文分词 版本 5.0
-// * IK Analyzer release 5.0
-// *
-// * Licensed to the Apache Software Foundation (ASF) under one or more
-// * contributor license agreements. See the NOTICE file distributed with
-// * this work for additional information regarding copyright ownership.
-// * The ASF licenses this file to You under the Apache License, Version 2.0
-// * (the "License"); you may not use this file except in compliance with
-// * the License. You may obtain a copy of the License at
-// *
-// * http://www.apache.org/licenses/LICENSE-2.0
-// *
-// * Unless required by applicable law or agreed to in writing, software
-// * distributed under the License is distributed on an "AS IS" BASIS,
-// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// * See the License for the specific language governing permissions and
-// * limitations under the License.
-// *
-// * 源代码由林良益(linliangyi2005@gmail.com)提供
-// * 版权声明 2012,乌龙茶工作室
-// * provided by Linliangyi and copyright 2012 by Oolong studio
-// *
-// *
-// */
-//package org.wltea.analyzer.sample;
-//
-//import java.io.IOException;
-//
-//import org.apache.lucene.analysis.Analyzer;
-//import org.apache.lucene.document.Document;
-//import org.apache.lucene.document.Field;
-//import org.apache.lucene.document.StringField;
-//import org.apache.lucene.document.TextField;
-//import org.apache.lucene.index.CorruptIndexException;
-//import org.apache.lucene.index.DirectoryReader;
-//import org.apache.lucene.index.IndexReader;
-//import org.apache.lucene.index.IndexWriter;
-//import org.apache.lucene.index.IndexWriterConfig;
-//import org.apache.lucene.index.IndexWriterConfig.OpenMode;
-//import org.apache.lucene.queryparser.classic.ParseException;
-//import org.apache.lucene.queryparser.classic.QueryParser;
-//import org.apache.lucene.search.IndexSearcher;
-//import org.apache.lucene.search.Query;
-//import org.apache.lucene.search.ScoreDoc;
-//import org.apache.lucene.search.TopDocs;
-//import org.apache.lucene.store.Directory;
-//import org.apache.lucene.store.LockObtainFailedException;
-//import org.apache.lucene.store.RAMDirectory;
-//import org.apache.lucene.util.Version;
-//import org.wltea.analyzer.lucene.IKAnalyzer;
-//
-//
-//
-//
-///**
-// * 使用IKAnalyzer进行Lucene索引和查询的演示
-// * 2012-3-2
-// *
-// * 以下是结合Lucene4.0 API的写法
-// *
-// */
-//public class LuceneIndexAndSearchDemo {
-//
-//
-// /**
-// * 模拟:
-// * 创建一个单条记录的索引,并对其进行搜索
-// * @param args
-// */
-// public static void main(String[] args){
-// //Lucene Document的域名
-// String fieldName = "text";
-// //检索内容
-// String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
-//
-// //实例化IKAnalyzer分词器
-// Analyzer analyzer = new IKAnalyzer(true);
-//
-// Directory directory = null;
-// IndexWriter iwriter = null;
-// IndexReader ireader = null;
-// IndexSearcher isearcher = null;
-// try {
-// //建立内存索引对象
-// directory = new RAMDirectory();
-//
-// //配置IndexWriterConfig
-// IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer);
-// iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
-// iwriter = new IndexWriter(directory , iwConfig);
-// //写入索引
-// Document doc = new Document();
-// doc.add(new StringField("ID", "10000", Field.Store.YES));
-// doc.add(new TextField(fieldName, text, Field.Store.YES));
-// iwriter.addDocument(doc);
-// iwriter.close();
-//
-//
-// //搜索过程**********************************
-// //实例化搜索器
-// ireader = DirectoryReader.open(directory);
-// isearcher = new IndexSearcher(ireader);
-//
-// String keyword = "中文分词工具包";
-// //使用QueryParser查询分析器构造Query对象
-// QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
-// qp.setDefaultOperator(QueryParser.AND_OPERATOR);
-// Query query = qp.parse(keyword);
-// System.out.println("Query = " + query);
-//
-// //搜索相似度最高的5条记录
-// TopDocs topDocs = isearcher.search(query , 5);
-// System.out.println("命中:" + topDocs.totalHits);
-// //输出结果
-// ScoreDoc[] scoreDocs = topDocs.scoreDocs;
-// for (int i = 0; i < topDocs.totalHits; i++){
-// Document targetDoc = isearcher.doc(scoreDocs[i].doc);
-// System.out.println("内容:" + targetDoc.toString());
-// }
-//
-// } catch (CorruptIndexException e) {
-// e.printStackTrace();
-// } catch (LockObtainFailedException e) {
-// e.printStackTrace();
-// } catch (IOException e) {
-// e.printStackTrace();
-// } catch (ParseException e) {
-// e.printStackTrace();
-// } finally{
-// if(ireader != null){
-// try {
-// ireader.close();
-// } catch (IOException e) {
-// e.printStackTrace();
-// }
-// }
-// if(directory != null){
-// try {
-// directory.close();
-// } catch (IOException e) {
-// e.printStackTrace();
-// }
-// }
-// }
-// }
-//}
+/**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ *
+ */
+package org.wltea.analyzer.sample;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.Version;
+import org.wltea.analyzer.lucene.IKAnalyzer;
+
+
+
+
+/**
+ * Demo of Lucene indexing and search with IKAnalyzer
+ * 2012-3-2
+ *
+ * Written against the Lucene 4.0 API
+ *
+ */
+public class LuceneIndexAndSearchDemo {
+
+
+ /**
+	 * Simulation:
+	 * create an index with a single record and search it
+ * @param args
+ */
+ public static void main(String[] args){
+		//Lucene Document field name
+		String fieldName = "text";
+		//content to index
+ String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
+
+		//instantiate the IKAnalyzer; true selects smart (coarse-grained) segmentation
+ Analyzer analyzer = new IKAnalyzer(true);
+
+ Directory directory = null;
+ IndexWriter iwriter = null;
+ IndexReader ireader = null;
+ IndexSearcher isearcher = null;
+ try {
+			//create an in-memory index
+ directory = new RAMDirectory();
+
+			//configure the IndexWriterConfig
+ IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer);
+ iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
+ iwriter = new IndexWriter(directory , iwConfig);
+			//write the index
+ Document doc = new Document();
+ doc.add(new StringField("ID", "10000", Field.Store.YES));
+ doc.add(new TextField(fieldName, text, Field.Store.YES));
+ iwriter.addDocument(doc);
+ iwriter.close();
+
+
+			//search phase **********************************
+			//instantiate the searcher
+ ireader = DirectoryReader.open(directory);
+ isearcher = new IndexSearcher(ireader);
+
+ String keyword = "中文分词工具包";
+			//build the Query object with the QueryParser
+ QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
+ qp.setDefaultOperator(QueryParser.AND_OPERATOR);
+ Query query = qp.parse(keyword);
+ System.out.println("Query = " + query);
+
+			//fetch the 5 highest-scoring records
+ TopDocs topDocs = isearcher.search(query , 5);
+ System.out.println("命中:" + topDocs.totalHits);
+			//print the results
+ ScoreDoc[] scoreDocs = topDocs.scoreDocs;
+ for (int i = 0; i < topDocs.totalHits; i++){
+ Document targetDoc = isearcher.doc(scoreDocs[i].doc);
+ System.out.println("内容:" + targetDoc.toString());
+ }
+
+ } catch (CorruptIndexException e) {
+ e.printStackTrace();
+ } catch (LockObtainFailedException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ } catch (ParseException e) {
+ e.printStackTrace();
+ } finally{
+ if(ireader != null){
+ try {
+ ireader.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ if(directory != null){
+ try {
+ directory.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+}
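
The demo above is self-contained: it indexes a single document into a RAMDirectory and immediately searches it with the same analyzer. When only the segmentation output is of interest, the underlying IKSegmenter can be driven directly; a minimal sketch (hypothetical, not part of the patch, reusing only APIs already visible in this diff):

    import java.io.IOException;
    import java.io.StringReader;

    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;

    public class SegmenterSketch {
        public static void main(String[] args) throws IOException {
            // useSmart=true selects the coarse-grained (smart) segmentation,
            // the same mode the demo picks via new IKAnalyzer(true)
            IKSegmenter seg = new IKSegmenter(new StringReader("IK Analyzer是一个中文分词开源工具包。"), true);
            Lexeme l;
            while ((l = seg.next()) != null) {
                System.out.println(l.getLexemeText());
            }
        }
    }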