From 5e14e3d629c6753b50ef5ef788ebd383739ece1f Mon Sep 17 00:00:00 2001
From: wangweihua
Date: Thu, 9 May 2013 13:46:25 +0800
Subject: [PATCH] elasticsearch ik 0.20.x => 0.90.x

---
 pom.xml                                            |    4 +-
 .../index/analysis/IkAnalyzer.java                 |   21 +-
 .../wltea/analyzer/core/AnalyzeContext.java        |   22 +-
 .../org/wltea/analyzer/core/CJKSegmenter.java      |   10 +-
 .../analyzer/core/CN_QuantifierSegmenter.java      |   10 +-
 .../org/wltea/analyzer/core/IKArbitrator.java      |    7 +-
 .../org/wltea/analyzer/core/IKSegmenter.java       |   34 +-
 .../wltea/analyzer/core/LetterSegmenter.java       |    2 +-
 .../org/wltea/analyzer/dic/DictSegment.java        |   10 +-
 .../org/wltea/analyzer/dic/Dictionary.java         |  705 ++++----
 src/main/java/org/wltea/analyzer/dic/Hit.java      |   12 +-
 .../org/wltea/analyzer/lucene/IKAnalyzer.java      |   84 +-
 .../wltea/analyzer/lucene/IKTokenizer.java         |  149 +-
 .../query/IKQueryExpressionParser.java             | 1432 ++++++++---------
 .../analyzer/query/SWMCQueryBuilder.java           |  304 ++--
 .../sample/LuceneIndexAndSearchDemo.java           |  294 ++--
 16 files changed, 1520 insertions(+), 1580 deletions(-)

diff --git a/pom.xml b/pom.xml
index 401f248..8dcf0d0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -31,7 +31,7 @@
-        <elasticsearch.version>0.20.2</elasticsearch.version>
+        <elasticsearch.version>0.90.0</elasticsearch.version>
@@ -132,4 +132,4 @@
-</project>
\ No newline at end of file
+</project>
diff --git a/src/main/java/org/elasticsearch/index/analysis/IkAnalyzer.java b/src/main/java/org/elasticsearch/index/analysis/IkAnalyzer.java
index 404e8e3..ffdf0d6 100644
--- a/src/main/java/org/elasticsearch/index/analysis/IkAnalyzer.java
+++ b/src/main/java/org/elasticsearch/index/analysis/IkAnalyzer.java
@@ -2,19 +2,32 @@
 package org.elasticsearch.index.analysis;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.wltea.analyzer.lucene.IKTokenizer;
+//import org.wltea.lucene.IKTokenizer;
 
 import java.io.Reader;
 
 public class IkAnalyzer extends Analyzer {
-
-    @Override public TokenStream tokenStream(String fieldName, Reader reader) {
-        return new IKTokenizer(reader,true);
-    }
+//    private boolean isMaxWordLength = false;
+//    @Override public TokenStream tokenStream(String fieldName, Reader reader) {
+//        return new IKTokenizer(reader,true);
+//    }
 
     public IkAnalyzer() {
         super();
     }
+
+    @Override
+    protected TokenStreamComponents createComponents(String s, Reader reader) {
+        Tokenizer tokenizer = new IKTokenizer(reader, true);
+        return new TokenStreamComponents(tokenizer);
+    }
+
+//    public boolean isMaxWordLength() {
+//        return isMaxWordLength;
+//    }
 }
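
Migration note: the IkAnalyzer hunk above is the heart of the 0.20.x => 0.90.x move. Elasticsearch 0.90 ships Lucene 4.x, where the overridable Analyzer.tokenStream() of Lucene 3.x is gone and analyzers instead implement the protected createComponents() factory. A minimal sketch of the new contract, assuming the Lucene 4.0 API bundled with ES 0.90.0 (the class name IkSmartAnalyzer is illustrative, not part of the patch):

    import java.io.Reader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Tokenizer;
    import org.wltea.analyzer.lucene.IKTokenizer;

    public final class IkSmartAnalyzer extends Analyzer {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            // Lucene 4.x calls this factory and reuses the returned components;
            // the one-argument constructor wires the tokenizer up as its own sink.
            Tokenizer tokenizer = new IKTokenizer(reader, true); // true = smart segmentation
            return new TokenStreamComponents(tokenizer);
        }
    }

Passing null as a second TokenStreamComponents argument would make the final Analyzer.tokenStream() hand callers a null TokenStream, so the one-argument form is the safe choice when no extra TokenFilter is chained after the tokenizer.
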
diff --git a/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java b/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java
index 288ec40..7ca75a9 100644
--- a/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java
+++ b/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java
@@ -24,11 +24,16 @@
  */
 package org.wltea.analyzer.core;
 
-import org.wltea.analyzer.dic.Dictionary;
-
 import java.io.IOException;
 import java.io.Reader;
-import java.util.*;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.Map;
+import java.util.Set;
+
+import org.wltea.analyzer.cfg.Configuration;
+import org.wltea.analyzer.dic.Dictionary;
 
 /**
  * 
@@ -68,12 +73,12 @@ class AnalyzeContext {
     private Map pathMap;
     //最终分词结果集
     private LinkedList results;
-    
+    private boolean useSmart;
     //分词器配置项
-    private boolean useSmart;
-    
+//    private Configuration cfg;
+    
     public AnalyzeContext(boolean useSmart){
-        this.useSmart = useSmart;
+        this.useSmart = useSmart;
         this.segmentBuff = new char[BUFF_SIZE];
         this.charTypes = new int[BUFF_SIZE];
         this.buffLocker = new HashSet();
@@ -313,7 +318,7 @@ class AnalyzeContext {
         while(result != null){
             //数量词合并
             this.compound(result);
-            if(Dictionary.isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
+            if(Dictionary.getSingleton().isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
                 //是停止词继续取列表的下一个
                 result = this.results.pollFirst();
             }else{
@@ -344,6 +349,7 @@ class AnalyzeContext {
      * 组合词元
      */
     private void compound(Lexeme result){
+
         if(!this.useSmart){
             return ;
         }
diff --git a/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java b/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java
index 86b1c8c..5867ff5 100644
--- a/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java
+++ b/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java
@@ -25,12 +25,12 @@
  */
 package org.wltea.analyzer.core;
 
-import org.wltea.analyzer.dic.Dictionary;
-import org.wltea.analyzer.dic.Hit;
-
 import java.util.LinkedList;
 import java.util.List;
 
+import org.wltea.analyzer.dic.Dictionary;
+import org.wltea.analyzer.dic.Hit;
+
 /**
  *  中文-日韩文子分词器
@@ -58,7 +58,7 @@ class CJKSegmenter implements ISegmenter {
             //处理词段队列
             Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
             for(Hit hit : tmpArray){
-                hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit);
+                hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
                 if(hit.isMatch()){
                     //输出当前的词
                     Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
@@ -77,7 +77,7 @@ class CJKSegmenter implements ISegmenter {
 
             //*********************************
             //再对当前指针位置的字符进行单字匹配
-            Hit singleCharHit = Dictionary.matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
+            Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
             if(singleCharHit.isMatch()){//首字成词
                 //输出当前的词
                 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
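
API note: the segmenter hunks on both sides of this point replace Dictionary's old static helpers (matchInMainDictWithHit, isStopWord, and so on) with instance calls routed through Dictionary.getSingleton(), whose definition appears later in this patch. A minimal sketch of the resulting call pattern, assuming the dictionaries were already loaded via Dictionary.getInstance().Init(indexSettings) as the plugin does (the class name DictLookupDemo and the sample strings are illustrative):

    import org.wltea.analyzer.dic.Dictionary;
    import org.wltea.analyzer.dic.Hit;

    public class DictLookupDemo {
        public static void main(String[] args) {
            // The singleton exists from class-load time, but its tries are only
            // populated after Init(Settings) has run.
            Dictionary dict = Dictionary.getSingleton();
            char[] buff = "中华人民共和国".toCharArray();
            // Prefix-match the first three characters against the main trie.
            Hit hit = dict.matchInMainDict(buff, 0, 3);
            if (hit.isPrefix()) {
                // Continue from the matched trie node instead of rescanning from the root.
                hit = dict.matchWithHit(buff, 3, hit);
            }
            boolean stop = dict.isStopWord("的".toCharArray(), 0, 1);
            System.out.println(hit.isMatch() + " / stopword: " + stop);
        }
    }
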
-24,14 +24,14 @@ */ package org.wltea.analyzer.core; -import org.wltea.analyzer.dic.Dictionary; -import org.wltea.analyzer.dic.Hit; - import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; +import org.wltea.analyzer.dic.Dictionary; +import org.wltea.analyzer.dic.Hit; + /** * * 中文数量词子分词器 @@ -155,7 +155,7 @@ class CN_QuantifierSegmenter implements ISegmenter{ //处理词段队列 Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]); for(Hit hit : tmpArray){ - hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit); + hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); if(hit.isMatch()){ //输出当前的词 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT); @@ -174,7 +174,7 @@ class CN_QuantifierSegmenter implements ISegmenter{ //********************************* //对当前指针位置的字符进行单字匹配 - Hit singleCharHit = Dictionary.matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1); + Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1); if(singleCharHit.isMatch()){//首字成量词词 //输出当前的词 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT); diff --git a/src/main/java/org/wltea/analyzer/core/IKArbitrator.java b/src/main/java/org/wltea/analyzer/core/IKArbitrator.java index e15647b..18af1bd 100644 --- a/src/main/java/org/wltea/analyzer/core/IKArbitrator.java +++ b/src/main/java/org/wltea/analyzer/core/IKArbitrator.java @@ -38,7 +38,7 @@ class IKArbitrator { /** * 分词歧义处理 - * @param orgLexemes +// * @param orgLexemes * @param useSmart */ void process(AnalyzeContext context , boolean useSmart){ @@ -87,7 +87,6 @@ class IKArbitrator { * 歧义识别 * @param lexemeCell 歧义路径链表头 * @param fullTextLength 歧义路径文本长度 - * @param option 候选结果路径 * @return */ private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){ @@ -120,7 +119,7 @@ class IKArbitrator { /** * 向前遍历,添加词元,构造一个无歧义词元组合 - * @param LexemePath path +// * @param LexemePath path * @return */ private Stack forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){ @@ -140,7 +139,7 @@ class IKArbitrator { /** * 回滚词元链,直到它能够接受指定的词元 - * @param lexeme +// * @param lexeme * @param l */ private void backPath(Lexeme l , LexemePath option){ diff --git a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java index 3548192..aa20452 100644 --- a/src/main/java/org/wltea/analyzer/core/IKSegmenter.java +++ b/src/main/java/org/wltea/analyzer/core/IKSegmenter.java @@ -23,14 +23,15 @@ */ package org.wltea.analyzer.core; -import org.elasticsearch.common.logging.ESLogger; -import org.elasticsearch.common.logging.Loggers; - import java.io.IOException; import java.io.Reader; import java.util.ArrayList; import java.util.List; +import org.wltea.analyzer.cfg.Configuration; +//import org.wltea.analyzer.cfg.DefaultConfig; +import org.wltea.analyzer.dic.Dictionary; + /** * IK分词器主类 * @@ -39,16 +40,18 @@ public final class IKSegmenter { //字符窜reader private Reader input; + //分词器配置项 + private Configuration cfg; //分词器上下文 private AnalyzeContext context; //分词处理器列表 private List segmenters; //分词歧义裁决器 private IKArbitrator arbitrator; - private ESLogger logger=null; - private final boolean useSmart; + private boolean useSmart = false; + - /** + /** * IK分词器构造函数 * @param input * @param useSmart 
为true,使用智能分词策略 @@ -57,16 +60,31 @@ public final class IKSegmenter { * 智能分词: 合并数词和量词,对分词结果进行歧义判断 */ public IKSegmenter(Reader input , boolean useSmart){ - logger = Loggers.getLogger("ik-analyzer"); this.input = input; +// this.cfg = DefaultConfig.getInstance(); this.useSmart=useSmart; - this.init(); + this.init(); + } + + /** + * IK分词器构造函数 + * @param input + * @param cfg 使用自定义的Configuration构造分词器 + * + */ + public IKSegmenter(Reader input , Configuration cfg){ + this.input = input; + this.cfg = cfg; + this.init(); } /** * 初始化 */ private void init(){ + //初始化词典单例 +// Dictionary.initial(this.cfg); +// Dictionary.getSingleton(); //初始化分词上下文 this.context = new AnalyzeContext(useSmart); //加载子分词器 diff --git a/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java b/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java index feb7f36..d239e91 100644 --- a/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java +++ b/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java @@ -120,7 +120,7 @@ class LetterSegmenter implements ISegmenter { /** * 处理数字字母混合输出 * 如:windos2000 | linliangyi2005@gmail.com - * @param input +// * @param input * @param context * @return */ diff --git a/src/main/java/org/wltea/analyzer/dic/DictSegment.java b/src/main/java/org/wltea/analyzer/dic/DictSegment.java index ecbcd9c..c34c5e2 100644 --- a/src/main/java/org/wltea/analyzer/dic/DictSegment.java +++ b/src/main/java/org/wltea/analyzer/dic/DictSegment.java @@ -326,13 +326,5 @@ class DictSegment implements Comparable{ //对当前节点存储的char进行比较 return this.nodeChar.compareTo(o.nodeChar); } - - public int getDicNum(){ - if(charMap!=null) - { - return charMap.size(); - } - return 0; - } - + } diff --git a/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/src/main/java/org/wltea/analyzer/dic/Dictionary.java index c02c5b7..36ea8e3 100644 --- a/src/main/java/org/wltea/analyzer/dic/Dictionary.java +++ b/src/main/java/org/wltea/analyzer/dic/Dictionary.java @@ -1,74 +1,233 @@ /** + * IK 中文分词 版本 5.0 + * IK Analyzer release 5.0 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * 源代码由林良益(linliangyi2005@gmail.com)提供 + * 版权声明 2012,乌龙茶工作室 + * provided by Linliangyi and copyright 2012 by Oolong studio + * * */ package org.wltea.analyzer.dic; +import java.io.*; +import java.util.Collection; +import java.util.List; + import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.wltea.analyzer.cfg.Configuration; -import java.io.*; -import java.util.Collection; -import java.util.List; - +/** + * 词典管理类,单子模式 + */ public class Dictionary { - public static final String PATH_DIC_MAIN = "ik/main.dic"; - public static final String PATH_DIC_SURNAME = "ik/surname.dic"; - public static final String PATH_DIC_QUANTIFIER = "ik/quantifier.dic"; - public static final String PATH_DIC_SUFFIX = "ik/suffix.dic"; - public static final String PATH_DIC_PREP = "ik/preposition.dic"; - public static final String PATH_DIC_STOP = "ik/stopword.dic"; - private static final Dictionary singleton; - - static{ - singleton = new Dictionary(); - } + /* + * 词典单子实例 + */ + private static Dictionary singleton; + + /* + * 主词典对象 + */ private DictSegment _MainDict; - - private DictSegment _SurnameDict; - + + /* + * 停止词词典 + */ + private DictSegment _StopWordDict; + /* + * 量词词典 + */ private DictSegment _QuantifierDict; - - private DictSegment _SuffixDict; - - private DictSegment _PrepDict; - - private DictSegment _StopWords; - - private Environment environment; - private Configuration configuration; + + /** + * 配置对象 + */ + private Configuration configuration; private ESLogger logger=null; private static boolean dictInited=false; - private Dictionary(){ + private Environment environment; + public static final String PATH_DIC_MAIN = "ik/main.dic"; + public static final String PATH_DIC_SURNAME = "ik/surname.dic"; + public static final String PATH_DIC_QUANTIFIER = "ik/quantifier.dic"; + public static final String PATH_DIC_SUFFIX = "ik/suffix.dic"; + public static final String PATH_DIC_PREP = "ik/preposition.dic"; + public static final String PATH_DIC_STOP = "ik/stopword.dic"; + private Dictionary(){ logger = Loggers.getLogger("ik-analyzer"); - } - - public Configuration getConfig(){ - return configuration; - } + } + static{ + singleton = new Dictionary(); + } +// public Configuration getConfig(){ +// return configuration; +// } +// private Dictionary(Configuration cfg){ +// this.cfg = cfg; +// this.loadMainDict(); +// this.loadStopWordDict(); +// this.loadQuantifierDict(); +// } public void Init(Settings indexSettings){ - if(!dictInited){ - environment =new Environment(indexSettings); - configuration=new Configuration(indexSettings); - loadMainDict(); - loadSurnameDict(); - loadQuantifierDict(); - loadSuffixDict(); - loadPrepDict(); - loadStopWordDict(); - dictInited=true; - } + if(!dictInited){ + environment =new Environment(indexSettings); + configuration=new Configuration(indexSettings); + loadMainDict(); +// loadSurnameDict(); + loadQuantifierDict(); +// loadSuffixDict(); +// loadPrepDict(); + loadStopWordDict(); + dictInited=true; + } } + /** + * 词典初始化 + * 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化 + * 只有当Dictionary类被实际调用时,才会开始载入词典, + * 这将延长首次分词操作的时间 + * 该方法提供了一个在应用加载阶段就初始化字典的手段 + * @return Dictionary + */ +// public static Dictionary initial(Configuration cfg){ +// if(singleton == null){ +// synchronized(Dictionary.class){ +// if(singleton == null){ +// singleton = new Dictionary(); +// return singleton; +// } +// } +// } +// return singleton; +// } + + /** + * 
获取词典单子实例 + * @return Dictionary 单例对象 + */ + public static Dictionary getSingleton(){ + if(singleton == null){ + throw new IllegalStateException("词典尚未初始化,请先调用initial方法"); + } + return singleton; + } + + /** + * 批量加载新词条 + * @param words Collection词条列表 + */ + public void addWords(Collection words){ + if(words != null){ + for(String word : words){ + if (word != null) { + //批量加载词条到主内存词典中 + singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray()); + } + } + } + } + + /** + * 批量移除(屏蔽)词条 + * @param words + */ + public void disableWords(Collection words){ + if(words != null){ + for(String word : words){ + if (word != null) { + //批量屏蔽词条 + singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray()); + } + } + } + } + + /** + * 检索匹配主词典 + * @param charArray + * @return Hit 匹配结果描述 + */ + public Hit matchInMainDict(char[] charArray){ + return singleton._MainDict.match(charArray); + } + + /** + * 检索匹配主词典 + * @param charArray + * @param begin + * @param length + * @return Hit 匹配结果描述 + */ + public Hit matchInMainDict(char[] charArray , int begin, int length){ + return singleton._MainDict.match(charArray, begin, length); + } + + /** + * 检索匹配量词词典 + * @param charArray + * @param begin + * @param length + * @return Hit 匹配结果描述 + */ + public Hit matchInQuantifierDict(char[] charArray , int begin, int length){ + return singleton._QuantifierDict.match(charArray, begin, length); + } + + + /** + * 从已匹配的Hit中直接取出DictSegment,继续向下匹配 + * @param charArray + * @param currentIndex + * @param matchedHit + * @return Hit + */ + public Hit matchWithHit(char[] charArray , int currentIndex , Hit matchedHit){ + DictSegment ds = matchedHit.getMatchedDictSegment(); + return ds.match(charArray, currentIndex, 1 , matchedHit); + } + + + /** + * 判断是否是停止词 + * @param charArray + * @param begin + * @param length + * @return boolean + */ + public boolean isStopWord(char[] charArray , int begin, int length){ + return singleton._StopWordDict.match(charArray, begin, length).isMatch(); + } + + /** + * 加载主词典及扩展词典 + */ private void loadMainDict(){ + //建立一个主词典实例 _MainDict = new DictSegment((char)0); - + //读取主词典文件 File file= new File(environment.configFile(), Dictionary.PATH_DIC_MAIN); InputStream is = null; @@ -77,24 +236,21 @@ public class Dictionary { } catch (FileNotFoundException e) { e.printStackTrace(); } - if(is == null){ - throw new RuntimeException("Main Dictionary not found!!!"); - } - + try { BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); - String theWord; + String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { - _MainDict.fillSegment(theWord.trim().toCharArray()); + _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); - logger.info("[Dict Loading] {},MainDict Size:{}",file.toString(),_MainDict.getDicNum()); + } catch (IOException ioe) { System.err.println("Main Dictionary loading exception."); ioe.printStackTrace(); - + }finally{ try { if(is != null){ @@ -105,41 +261,42 @@ public class Dictionary { e.printStackTrace(); } } - - + //加载扩展词典 + this.loadExtDict(); + } + + /** + * 加载用户配置的扩展词典到主词库表 + */ + private void loadExtDict(){ + //加载扩展词典配置 List extDictFiles = configuration.getExtDictionarys(); if(extDictFiles != null){ + InputStream is = null; for(String extDictName : extDictFiles){ - - File tempFile=new File(environment.configFile(),extDictName); - - try { - is = new FileInputStream(tempFile); - } catch (FileNotFoundException e) { - e.printStackTrace(); - logger.error("[Dict 
Loading]",e); - } - - if(is == null){ + //读取扩展词典文件 + System.out.println("加载扩展词典:" + extDictName); + is = this.getClass().getClassLoader().getResourceAsStream(extDictName); + //如果找不到扩展的字典,则忽略 + if(is == null){ continue; } try { - BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); - String theWord; + String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { - - + //加载扩展词典数据到主内存词典中 + //System.out.println(theWord); _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); - logger.info("[Dict Loading] {},MainDict Size:{}",tempFile.toString(),_MainDict.getDicNum()); + } catch (IOException ioe) { System.err.println("Extension Dictionary loading exception."); ioe.printStackTrace(); - + }finally{ try { if(is != null){ @@ -151,77 +308,85 @@ public class Dictionary { } } } - } + } } - - - private void loadSurnameDict(){ - - _SurnameDict = new DictSegment((char)0); - File file=new File(environment.configFile(),Dictionary.PATH_DIC_SURNAME); - InputStream is = null; - try { - is = new FileInputStream(file); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - if(is == null){ - throw new RuntimeException("Surname Dictionary not found!!!"); - } - try { - BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); - String theWord; - do { - theWord = br.readLine(); - if (theWord != null && !"".equals(theWord.trim())) { - _SurnameDict.fillSegment(theWord.trim().toCharArray()); + + /** + * 加载用户扩展的停止词词典 + */ + private void loadStopWordDict(){ + //建立一个主词典实例 + _StopWordDict = new DictSegment((char)0); + //加载扩展停止词典 + List extStopWordDictFiles = configuration.getExtStopWordDictionarys(); + if(extStopWordDictFiles != null){ + InputStream is = null; + for(String extStopWordDictName : extStopWordDictFiles){ + System.out.println("加载扩展停止词典:" + extStopWordDictName); + //读取扩展词典文件 + is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName); + //如果找不到扩展的字典,则忽略 + if(is == null){ + continue; } - } while (theWord != null); - logger.info("[Dict Loading] {},SurnameDict Size:{}",file.toString(),_SurnameDict.getDicNum()); - } catch (IOException ioe) { - System.err.println("Surname Dictionary loading exception."); - ioe.printStackTrace(); - - }finally{ - try { - if(is != null){ - is.close(); - is = null; + try { + BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); + String theWord = null; + do { + theWord = br.readLine(); + if (theWord != null && !"".equals(theWord.trim())) { + //System.out.println(theWord); + //加载扩展停止词典数据到内存中 + _StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); + } + } while (theWord != null); + + } catch (IOException ioe) { + System.err.println("Extension Stop word Dictionary loading exception."); + ioe.printStackTrace(); + + }finally{ + try { + if(is != null){ + is.close(); + is = null; + } + } catch (IOException e) { + e.printStackTrace(); + } } - } catch (IOException e) { - e.printStackTrace(); } - } + } } - - + + /** + * 加载量词词典 + */ private void loadQuantifierDict(){ - + //建立一个量词典实例 _QuantifierDict = new DictSegment((char)0); + //读取量词词典文件 File file=new File(environment.configFile(),Dictionary.PATH_DIC_QUANTIFIER); InputStream is = null; try { is = new FileInputStream(file); } catch (FileNotFoundException e) { e.printStackTrace(); - } - if(is == null){ - throw new RuntimeException("Quantifier Dictionary not found!!!"); } try { BufferedReader br = new BufferedReader(new 
InputStreamReader(is , "UTF-8"), 512); - String theWord; + String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { - _QuantifierDict.fillSegment(theWord.trim().toCharArray()); + _QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); - logger.info("[Dict Loading] {},QuantifierDict Size:{}",file.toString(),_QuantifierDict.getDicNum()); + } catch (IOException ioe) { System.err.println("Quantifier Dictionary loading exception."); ioe.printStackTrace(); - + }finally{ try { if(is != null){ @@ -235,304 +400,8 @@ public class Dictionary { } - private void loadSuffixDict(){ - - _SuffixDict = new DictSegment((char)0); - File file=new File(environment.configFile(),Dictionary.PATH_DIC_SUFFIX); - InputStream is = null; - try { - is = new FileInputStream(file); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - if(is == null){ - throw new RuntimeException("Suffix Dictionary not found!!!"); - } - try { - - BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); - String theWord; - do { - theWord = br.readLine(); - if (theWord != null && !"".equals(theWord.trim())) { - _SuffixDict.fillSegment(theWord.trim().toCharArray()); - } - } while (theWord != null); - logger.info("[Dict Loading] {},SuffixDict Size:{}",file.toString(),_SuffixDict.getDicNum()); - } catch (IOException ioe) { - System.err.println("Suffix Dictionary loading exception."); - ioe.printStackTrace(); - - }finally{ - try { - if(is != null){ - is.close(); - is = null; - } - } catch (IOException e) { - e.printStackTrace(); - } - } - } - - - private void loadPrepDict(){ - - _PrepDict = new DictSegment((char)0); - File file=new File(environment.configFile(),Dictionary.PATH_DIC_PREP); - InputStream is = null; - try { - is = new FileInputStream(file); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - if(is == null){ - throw new RuntimeException("Preposition Dictionary not found!!!"); - } - try { - - BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); - String theWord; - do { - theWord = br.readLine(); - if (theWord != null && !"".equals(theWord.trim())) { - - _PrepDict.fillSegment(theWord.trim().toCharArray()); - } - } while (theWord != null); - logger.info("[Dict Loading] {},PrepDict Size:{}",file.toString(),_PrepDict.getDicNum()); - } catch (IOException ioe) { - System.err.println("Preposition Dictionary loading exception."); - ioe.printStackTrace(); - - }finally{ - try { - if(is != null){ - is.close(); - is = null; - } - } catch (IOException e) { - e.printStackTrace(); - } - } - } - - - private void loadStopWordDict(){ - - _StopWords = new DictSegment((char)0); - File file=new File(environment.configFile(),Dictionary.PATH_DIC_STOP); - InputStream is = null; - try { - is = new FileInputStream(file); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - if(is == null){ - throw new RuntimeException("Stopword Dictionary not found!!!"); - } - try { - - BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); - String theWord; - do { - theWord = br.readLine(); - if (theWord != null && !"".equals(theWord.trim())) { - _StopWords.fillSegment(theWord.trim().toCharArray()); - } - } while (theWord != null); - logger.info("[Dict Loading] {},Stopwords Size:{}",file.toString(),_StopWords.getDicNum()); - } catch (IOException ioe) { - System.err.println("Stopword Dictionary loading exception."); - ioe.printStackTrace(); - - }finally{ - try { - 
if(is != null){ - is.close(); - is = null; - } - } catch (IOException e) { - e.printStackTrace(); - } - } - - - List extStopWordDictFiles = configuration.getExtStopWordDictionarys(); - if(extStopWordDictFiles != null){ - for(String extStopWordDictName : extStopWordDictFiles){ - File tempFile=new File(environment.configFile(),extStopWordDictName); - try { - is = new FileInputStream(tempFile); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - - if(is == null){ - continue; - } - try { - - BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); - String theWord; - do { - theWord = br.readLine(); - if (theWord != null && !"".equals(theWord.trim())) { - - - _StopWords.fillSegment(theWord.trim().toCharArray()); - } - } while (theWord != null); - logger.info("[Dict Loading] {},Stopwords Size:{}",tempFile.toString(),_StopWords.getDicNum()); - } catch (IOException ioe) { - System.err.println("Extension Stop word Dictionary loading exception."); - ioe.printStackTrace(); - - }finally{ - try { - if(is != null){ - is.close(); - is = null; - } - } catch (IOException e) { - e.printStackTrace(); - } - } - } - } - - } - - public static Dictionary getInstance(){ - return Dictionary.singleton; - } - - public static void loadExtendWords(Collection extWords){ - if(extWords != null){ - for(String extWord : extWords){ - if (extWord != null) { - - singleton._MainDict.fillSegment(extWord.trim().toCharArray()); - } - } - } - } - - - public static void loadExtendStopWords(Collection extStopWords){ - if(extStopWords != null){ - for(String extStopWord : extStopWords){ - if (extStopWord != null) { - - singleton._StopWords.fillSegment(extStopWord.trim().toCharArray()); - } - } - } - } - - - public static Hit matchInMainDict(char[] charArray){ - return singleton._MainDict.match(charArray); - } - - - public static Hit matchInMainDict(char[] charArray , int begin, int length){ - return singleton._MainDict.match(charArray, begin, length); - } - - - public static Hit matchInMainDictWithHit(char[] charArray , int currentIndex , Hit matchedHit){ - DictSegment ds = matchedHit.getMatchedDictSegment(); - return ds.match(charArray, currentIndex, 1 , matchedHit); - } - + public static Dictionary getInstance(){ + return Dictionary.singleton; + } - public static Hit matchInSurnameDict(char[] charArray , int begin, int length){ - return singleton._SurnameDict.match(charArray, begin, length); - } - - - - - - - - - - - - - - - - - - - - - - /** - * 检索匹配量词词典 - * @param charArray - * @param begin - * @param length - * @return Hit 匹配结果描述 - */ - public static Hit matchInQuantifierDict(char[] charArray , int begin, int length){ - return singleton._QuantifierDict.match(charArray, begin, length); - } - - /** - * 检索匹配在后缀词典 - * @param charArray - * @param begin - * @param length - * @return Hit 匹配结果描述 - */ - public static Hit matchInSuffixDict(char[] charArray , int begin, int length){ - return singleton._SuffixDict.match(charArray, begin, length); - } - - - - - - - - - - - - - - - - - - - - - - - /** - * 检索匹配介词、副词词典 - * @param charArray - * @param begin - * @param length - * @return Hit 匹配结果描述 - */ - public static Hit matchInPrepDict(char[] charArray , int begin, int length){ - return singleton._PrepDict.match(charArray, begin, length); - } - - /** - * 判断是否是停止词 - * @param charArray - * @param begin - * @param length - * @return boolean - */ - public static boolean isStopWord(char[] charArray , int begin, int length){ - return singleton._StopWords.match(charArray, begin, length).isMatch(); - } } diff 
--git a/src/main/java/org/wltea/analyzer/dic/Hit.java b/src/main/java/org/wltea/analyzer/dic/Hit.java index b5110bd..cdfd0e5 100644 --- a/src/main/java/org/wltea/analyzer/dic/Hit.java +++ b/src/main/java/org/wltea/analyzer/dic/Hit.java @@ -58,7 +58,9 @@ public class Hit { public boolean isMatch() { return (this.hitState & MATCH) > 0; } - + /** + * + */ public void setMatch() { this.hitState = this.hitState | MATCH; } @@ -69,7 +71,9 @@ public class Hit { public boolean isPrefix() { return (this.hitState & PREFIX) > 0; } - + /** + * + */ public void setPrefix() { this.hitState = this.hitState | PREFIX; } @@ -79,7 +83,9 @@ public class Hit { public boolean isUnmatch() { return this.hitState == UNMATCH ; } - + /** + * + */ public void setUnmatch() { this.hitState = UNMATCH; } diff --git a/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java b/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java index 665954d..1dd15d5 100644 --- a/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java +++ b/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java @@ -1,51 +1,87 @@ /** + * IK 中文分词 版本 5.0.1 + * IK Analyzer release 5.0.1 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * 源代码由林良益(linliangyi2005@gmail.com)提供 + * 版权声明 2012,乌龙茶工作室 + * provided by Linliangyi and copyright 2012 by Oolong studio * */ package org.wltea.analyzer.lucene; +import java.io.Reader; + import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.elasticsearch.common.settings.Settings; import org.wltea.analyzer.dic.Dictionary; -import java.io.Reader; - -public final class IKAnalyzer extends Analyzer { +/** + * IK分词器,Lucene Analyzer接口实现 + * 兼容Lucene 4.0版本 + */ +public final class IKAnalyzer extends Analyzer{ - private boolean isMaxWordLength = false; - private boolean useSmart=false; + private boolean useSmart; + + public boolean useSmart() { + return useSmart; + } - public IKAnalyzer(){ + public void setUseSmart(boolean useSmart) { + this.useSmart = useSmart; + } + + /** + * IK分词器Lucene Analyzer接口实现类 + * + * 默认细粒度切分算法 + */ + public IKAnalyzer(){ this(false); } - - public IKAnalyzer(boolean isMaxWordLength){ + /** + * IK分词器Lucene Analyzer接口实现类 + * + * @param useSmart 当为true时,分词器进行智能切分 + */ + public IKAnalyzer(boolean useSmart){ super(); - this.setMaxWordLength(isMaxWordLength); + this.useSmart = useSmart; } public IKAnalyzer(Settings indexSetting,Settings settings1) { super(); - Dictionary.getInstance().Init(indexSetting); + Dictionary.getInstance().Init(indexSetting); if(settings1.get("use_smart", "true").equals("true")){ - useSmart=true; + useSmart = true; } } - - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - return new IKTokenizer(reader , useSmart); - } - - public void setMaxWordLength(boolean isMaxWordLength) { - this.isMaxWordLength = isMaxWordLength; - } - - public boolean isMaxWordLength() { - return isMaxWordLength; + /** + * 重载Analyzer接口,构造分词组件 + */ + @Override + protected TokenStreamComponents createComponents(String fieldName, final Reader in) { + Tokenizer _IKTokenizer = new IKTokenizer(in , this.useSmart()); + return new TokenStreamComponents(_IKTokenizer); } } diff --git a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java index ffd5f02..846e4f1 100644 --- a/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java +++ b/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java @@ -1,7 +1,7 @@ /** * IK 中文分词 版本 5.0.1 * IK Analyzer release 5.0.1 - * + * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
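
Behavior note: the surrounding IKTokenizer hunks are almost entirely re-indentation; the class keeps implementing Lucene 4.x's stateful consumption contract (reset, then incrementToken until it returns false, then end and close). A minimal consumer sketch, assuming Lucene 4.0 and an already-initialized dictionary (the class name TokenizeDemo, the field name, and the sample text are illustrative):

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.wltea.analyzer.lucene.IKAnalyzer;

    public class TokenizeDemo {
        public static void main(String[] args) throws IOException {
            // Assumes Dictionary.getInstance().Init(...) has populated the tries;
            // otherwise the stop-word lookup inside AnalyzeContext hits a null trie.
            Analyzer analyzer = new IKAnalyzer(true); // true = smart segmentation
            TokenStream ts = analyzer.tokenStream("content", new StringReader("中华人民共和国"));
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken() in Lucene 4.x
            while (ts.incrementToken()) { // IKTokenizer.incrementToken() fills the attributes
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
            }
            ts.end();                     // publishes the final offset computed in IKTokenizer.end()
            ts.close();
        }
    }
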
@@ -20,94 +20,95 @@ * 源代码由林良益(linliangyi2005@gmail.com)提供 * 版权声明 2012,乌龙茶工作室 * provided by Linliangyi and copyright 2012 by Oolong studio - * + * - * + * */ package org.wltea.analyzer.lucene; +import java.io.IOException; +import java.io.Reader; + import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; + import org.wltea.analyzer.core.IKSegmenter; import org.wltea.analyzer.core.Lexeme; -import java.io.IOException; -import java.io.Reader; - /** * IK分词器 Lucene Tokenizer适配器类 * 兼容Lucene 4.0版本 */ public final class IKTokenizer extends Tokenizer { + + //IK分词器实现 + private IKSegmenter _IKImplement; + + //词元文本属性 + private final CharTermAttribute termAtt; + //词元位移属性 + private final OffsetAttribute offsetAtt; + //词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量) + private final TypeAttribute typeAtt; + //记录最后一个词元的结束位置 + private int endPosition; + + /** + * Lucene 4.0 Tokenizer适配器类构造函数 + * @param in + * @param useSmart + */ + public IKTokenizer(Reader in , boolean useSmart){ + super(in); + offsetAtt = addAttribute(OffsetAttribute.class); + termAtt = addAttribute(CharTermAttribute.class); + typeAtt = addAttribute(TypeAttribute.class); + _IKImplement = new IKSegmenter(input , useSmart); + } - //IK分词器实现 - private IKSegmenter _IKImplement; - - //词元文本属性 - private final CharTermAttribute termAtt; - //词元位移属性 - private final OffsetAttribute offsetAtt; - //词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量) - private final TypeAttribute typeAtt; - //记录最后一个词元的结束位置 - private int endPosition; - - /** - * Lucene 4.0 Tokenizer适配器类构造函数 - * @param in - * @param useSmart - */ - public IKTokenizer(Reader in , boolean useSmart){ - super(in); - offsetAtt = addAttribute(OffsetAttribute.class); - termAtt = addAttribute(CharTermAttribute.class); - typeAtt = addAttribute(TypeAttribute.class); - _IKImplement = new IKSegmenter(input , useSmart); - } - - /* (non-Javadoc) - * @see org.apache.lucene.analysis.TokenStream#incrementToken() - */ - @Override - public boolean incrementToken() throws IOException { - //清除所有的词元属性 - clearAttributes(); - Lexeme nextLexeme = _IKImplement.next(); - if(nextLexeme != null){ - //将Lexeme转成Attributes - //设置词元文本 - termAtt.append(nextLexeme.getLexemeText()); - //设置词元长度 - termAtt.setLength(nextLexeme.getLength()); - //设置词元位移 - offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition()); - //记录分词的最后位置 - endPosition = nextLexeme.getEndPosition(); - //记录词元分类 - typeAtt.setType(nextLexeme.getLexemeTypeString()); - //返会true告知还有下个词元 - return true; - } - //返会false告知词元输出完毕 - return false; - } - - /* - * (non-Javadoc) - * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader) - */ - @Override - public void reset() throws IOException { - super.reset(); - _IKImplement.reset(input); - } - - @Override - public final void end() { - // set final offset - int finalOffset = correctOffset(this.endPosition); - offsetAtt.setOffset(finalOffset, finalOffset); - } + /* (non-Javadoc) + * @see org.apache.lucene.analysis.TokenStream#incrementToken() + */ + @Override + public boolean incrementToken() throws IOException { + //清除所有的词元属性 + clearAttributes(); + Lexeme nextLexeme = _IKImplement.next(); + if(nextLexeme != null){ + //将Lexeme转成Attributes + //设置词元文本 + termAtt.append(nextLexeme.getLexemeText()); + //设置词元长度 + termAtt.setLength(nextLexeme.getLength()); + //设置词元位移 + 
offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition()); + //记录分词的最后位置 + endPosition = nextLexeme.getEndPosition(); + //记录词元分类 + typeAtt.setType(nextLexeme.getLexemeTypeString()); + //返会true告知还有下个词元 + return true; + } + //返会false告知词元输出完毕 + return false; + } + + /* + * (non-Javadoc) + * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader) + */ + @Override + public void reset() throws IOException { + super.reset(); + _IKImplement.reset(input); + } + + @Override + public final void end() { + // set final offset + int finalOffset = correctOffset(this.endPosition); + offsetAtt.setOffset(finalOffset, finalOffset); + } } diff --git a/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java b/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java index 63b730b..1b86a35 100644 --- a/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java +++ b/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java @@ -1,716 +1,716 @@ -///** -// * IK 中文分词 版本 5.0 -// * IK Analyzer release 5.0 -// * -// * Licensed to the Apache Software Foundation (ASF) under one or more -// * contributor license agreements. See the NOTICE file distributed with -// * this work for additional information regarding copyright ownership. -// * The ASF licenses this file to You under the Apache License, Version 2.0 -// * (the "License"); you may not use this file except in compliance with -// * the License. You may obtain a copy of the License at -// * -// * http://www.apache.org/licenses/LICENSE-2.0 -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, -// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// * See the License for the specific language governing permissions and -// * limitations under the License. 
-// * -// * 源代码由林良益(linliangyi2005@gmail.com)提供 -// * 版权声明 2012,乌龙茶工作室 -// * provided by Linliangyi and copyright 2012 by Oolong studio -// * -// */ -//package org.wltea.analyzer.query; -// -//import java.util.ArrayList; -//import java.util.LinkedList; -//import java.util.List; -//import java.util.Stack; -// -//import org.apache.lucene.index.Term; -//import org.apache.lucene.search.BooleanClause; -//import org.apache.lucene.search.BooleanQuery; -//import org.apache.lucene.search.Query; -//import org.apache.lucene.search.TermQuery; -//import org.apache.lucene.search.TermRangeQuery; -//import org.apache.lucene.search.BooleanClause.Occur; -//import org.apache.lucene.util.BytesRef; -// -///** -// * IK简易查询表达式解析 -// * 结合SWMCQuery算法 -// * -// * 表达式例子 : -// * (id='1231231' && title:'monkey') || (content:'你好吗' || ulr='www.ik.com') - name:'helloword' -// * @author linliangyi -// * -// */ -//public class IKQueryExpressionParser { -// -// //public static final String LUCENE_SPECIAL_CHAR = "&&||-()':={}[],"; -// -// private List elements = new ArrayList(); -// -// private Stack querys = new Stack(); -// -// private Stack operates = new Stack(); -// -// /** -// * 解析查询表达式,生成Lucene Query对象 -// * -// * @param expression -// * @param quickMode -// * @return Lucene query -// */ -// public Query parseExp(String expression , boolean quickMode){ -// Query lucenceQuery = null; -// if(expression != null && !"".equals(expression.trim())){ -// try{ -// //文法解析 -// this.splitElements(expression); -// //语法解析 -// this.parseSyntax(quickMode); -// if(this.querys.size() == 1){ -// lucenceQuery = this.querys.pop(); -// }else{ -// throw new IllegalStateException("表达式异常: 缺少逻辑操作符 或 括号缺失"); -// } -// }finally{ -// elements.clear(); -// querys.clear(); -// operates.clear(); -// } -// } -// return lucenceQuery; -// } -// -// /** -// * 表达式文法解析 -// * @param expression -// */ -// private void splitElements(String expression){ -// -// if(expression == null){ -// return; -// } -// Element curretElement = null; -// -// char[] expChars = expression.toCharArray(); -// for(int i = 0 ; i < expChars.length ; i++){ -// switch(expChars[i]){ -// case '&' : -// if(curretElement == null){ -// curretElement = new Element(); -// curretElement.type = '&'; -// curretElement.append(expChars[i]); -// }else if(curretElement.type == '&'){ -// curretElement.append(expChars[i]); -// this.elements.add(curretElement); -// curretElement = null; -// }else if(curretElement.type == '\''){ -// curretElement.append(expChars[i]); -// }else { -// this.elements.add(curretElement); -// curretElement = new Element(); -// curretElement.type = '&'; -// curretElement.append(expChars[i]); -// } -// break; -// -// case '|' : -// if(curretElement == null){ -// curretElement = new Element(); -// curretElement.type = '|'; -// curretElement.append(expChars[i]); -// }else if(curretElement.type == '|'){ -// curretElement.append(expChars[i]); -// this.elements.add(curretElement); -// curretElement = null; -// }else if(curretElement.type == '\''){ -// curretElement.append(expChars[i]); -// }else { -// this.elements.add(curretElement); -// curretElement = new Element(); -// curretElement.type = '|'; -// curretElement.append(expChars[i]); -// } -// break; -// -// case '-' : -// if(curretElement != null){ -// if(curretElement.type == '\''){ -// curretElement.append(expChars[i]); -// continue; -// }else{ -// this.elements.add(curretElement); -// } -// } -// curretElement = new Element(); -// curretElement.type = '-'; -// curretElement.append(expChars[i]); -// 
this.elements.add(curretElement); -// curretElement = null; -// break; -// -// case '(' : -// if(curretElement != null){ -// if(curretElement.type == '\''){ -// curretElement.append(expChars[i]); -// continue; -// }else{ -// this.elements.add(curretElement); -// } -// } -// curretElement = new Element(); -// curretElement.type = '('; -// curretElement.append(expChars[i]); -// this.elements.add(curretElement); -// curretElement = null; -// break; -// -// case ')' : -// if(curretElement != null){ -// if(curretElement.type == '\''){ -// curretElement.append(expChars[i]); -// continue; -// }else{ -// this.elements.add(curretElement); -// } -// } -// curretElement = new Element(); -// curretElement.type = ')'; -// curretElement.append(expChars[i]); -// this.elements.add(curretElement); -// curretElement = null; -// break; -// -// case ':' : -// if(curretElement != null){ -// if(curretElement.type == '\''){ -// curretElement.append(expChars[i]); -// continue; -// }else{ -// this.elements.add(curretElement); -// } -// } -// curretElement = new Element(); -// curretElement.type = ':'; -// curretElement.append(expChars[i]); -// this.elements.add(curretElement); -// curretElement = null; -// break; -// -// case '=' : -// if(curretElement != null){ -// if(curretElement.type == '\''){ -// curretElement.append(expChars[i]); -// continue; -// }else{ -// this.elements.add(curretElement); -// } -// } -// curretElement = new Element(); -// curretElement.type = '='; -// curretElement.append(expChars[i]); -// this.elements.add(curretElement); -// curretElement = null; -// break; -// -// case ' ' : -// if(curretElement != null){ -// if(curretElement.type == '\''){ -// curretElement.append(expChars[i]); -// }else{ -// this.elements.add(curretElement); -// curretElement = null; -// } -// } -// -// break; -// -// case '\'' : -// if(curretElement == null){ -// curretElement = new Element(); -// curretElement.type = '\''; -// -// }else if(curretElement.type == '\''){ -// this.elements.add(curretElement); -// curretElement = null; -// -// }else{ -// this.elements.add(curretElement); -// curretElement = new Element(); -// curretElement.type = '\''; -// -// } -// break; -// -// case '[': -// if(curretElement != null){ -// if(curretElement.type == '\''){ -// curretElement.append(expChars[i]); -// continue; -// }else{ -// this.elements.add(curretElement); -// } -// } -// curretElement = new Element(); -// curretElement.type = '['; -// curretElement.append(expChars[i]); -// this.elements.add(curretElement); -// curretElement = null; -// break; -// -// case ']': -// if(curretElement != null){ -// if(curretElement.type == '\''){ -// curretElement.append(expChars[i]); -// continue; -// }else{ -// this.elements.add(curretElement); -// } -// } -// curretElement = new Element(); -// curretElement.type = ']'; -// curretElement.append(expChars[i]); -// this.elements.add(curretElement); -// curretElement = null; -// -// break; -// -// case '{': -// if(curretElement != null){ -// if(curretElement.type == '\''){ -// curretElement.append(expChars[i]); -// continue; -// }else{ -// this.elements.add(curretElement); -// } -// } -// curretElement = new Element(); -// curretElement.type = '{'; -// curretElement.append(expChars[i]); -// this.elements.add(curretElement); -// curretElement = null; -// break; -// -// case '}': -// if(curretElement != null){ -// if(curretElement.type == '\''){ -// curretElement.append(expChars[i]); -// continue; -// }else{ -// this.elements.add(curretElement); -// } -// } -// curretElement = new Element(); 
-// curretElement.type = '}'; -// curretElement.append(expChars[i]); -// this.elements.add(curretElement); -// curretElement = null; -// -// break; -// case ',': -// if(curretElement != null){ -// if(curretElement.type == '\''){ -// curretElement.append(expChars[i]); -// continue; -// }else{ -// this.elements.add(curretElement); -// } -// } -// curretElement = new Element(); -// curretElement.type = ','; -// curretElement.append(expChars[i]); -// this.elements.add(curretElement); -// curretElement = null; -// -// break; -// -// default : -// if(curretElement == null){ -// curretElement = new Element(); -// curretElement.type = 'F'; -// curretElement.append(expChars[i]); -// -// }else if(curretElement.type == 'F'){ -// curretElement.append(expChars[i]); -// -// }else if(curretElement.type == '\''){ -// curretElement.append(expChars[i]); -// -// }else{ -// this.elements.add(curretElement); -// curretElement = new Element(); -// curretElement.type = 'F'; -// curretElement.append(expChars[i]); -// } -// } -// } -// -// if(curretElement != null){ -// this.elements.add(curretElement); -// curretElement = null; -// } -// } -// -// /** -// * 语法解析 -// * -// */ -// private void parseSyntax(boolean quickMode){ -// for(int i = 0 ; i < this.elements.size() ; i++){ -// Element e = this.elements.get(i); -// if('F' == e.type){ -// Element e2 = this.elements.get(i + 1); -// if('=' != e2.type && ':' != e2.type){ -// throw new IllegalStateException("表达式异常: = 或 : 号丢失"); -// } -// Element e3 = this.elements.get(i + 2); -// //处理 = 和 : 运算 -// if('\'' == e3.type){ -// i+=2; -// if('=' == e2.type){ -// TermQuery tQuery = new TermQuery(new Term(e.toString() , e3.toString())); -// this.querys.push(tQuery); -// }else if(':' == e2.type){ -// String keyword = e3.toString(); -// //SWMCQuery Here -// Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword , quickMode); -// this.querys.push(_SWMCQuery); -// } -// -// }else if('[' == e3.type || '{' == e3.type){ -// i+=2; -// //处理 [] 和 {} -// LinkedList eQueue = new LinkedList(); -// eQueue.add(e3); -// for( i++ ; i < this.elements.size() ; i++){ -// Element eN = this.elements.get(i); -// eQueue.add(eN); -// if(']' == eN.type || '}' == eN.type){ -// break; -// } -// } -// //翻译RangeQuery -// Query rangeQuery = this.toTermRangeQuery(e , eQueue); -// this.querys.push(rangeQuery); -// }else{ -// throw new IllegalStateException("表达式异常:匹配值丢失"); -// } -// -// }else if('(' == e.type){ -// this.operates.push(e); -// -// }else if(')' == e.type){ -// boolean doPop = true; -// while(doPop && !this.operates.empty()){ -// Element op = this.operates.pop(); -// if('(' == op.type){ -// doPop = false; -// }else { -// Query q = toBooleanQuery(op); -// this.querys.push(q); -// } -// -// } -// }else{ -// -// if(this.operates.isEmpty()){ -// this.operates.push(e); -// }else{ -// boolean doPeek = true; -// while(doPeek && !this.operates.isEmpty()){ -// Element eleOnTop = this.operates.peek(); -// if('(' == eleOnTop.type){ -// doPeek = false; -// this.operates.push(e); -// }else if(compare(e , eleOnTop) == 1){ -// this.operates.push(e); -// doPeek = false; -// }else if(compare(e , eleOnTop) == 0){ -// Query q = toBooleanQuery(eleOnTop); -// this.operates.pop(); -// this.querys.push(q); -// }else{ -// Query q = toBooleanQuery(eleOnTop); -// this.operates.pop(); -// this.querys.push(q); -// } -// } -// -// if(doPeek && this.operates.empty()){ -// this.operates.push(e); -// } -// } -// } -// } -// -// while(!this.operates.isEmpty()){ -// Element eleOnTop = this.operates.pop(); -// Query q = 
toBooleanQuery(eleOnTop); -// this.querys.push(q); -// } -// } -// -// /** -// * 根据逻辑操作符,生成BooleanQuery -// * @param op -// * @return -// */ -// private Query toBooleanQuery(Element op){ -// if(this.querys.size() == 0){ -// return null; -// } -// -// BooleanQuery resultQuery = new BooleanQuery(); -// -// if(this.querys.size() == 1){ -// return this.querys.get(0); -// } -// -// Query q2 = this.querys.pop(); -// Query q1 = this.querys.pop(); -// if('&' == op.type){ -// if(q1 != null){ -// if(q1 instanceof BooleanQuery){ -// BooleanClause[] clauses = ((BooleanQuery)q1).getClauses(); -// if(clauses.length > 0 -// && clauses[0].getOccur() == Occur.MUST){ -// for(BooleanClause c : clauses){ -// resultQuery.add(c); -// } -// }else{ -// resultQuery.add(q1,Occur.MUST); -// } -// -// }else{ -// //q1 instanceof TermQuery -// //q1 instanceof TermRangeQuery -// //q1 instanceof PhraseQuery -// //others -// resultQuery.add(q1,Occur.MUST); -// } -// } -// -// if(q2 != null){ -// if(q2 instanceof BooleanQuery){ -// BooleanClause[] clauses = ((BooleanQuery)q2).getClauses(); -// if(clauses.length > 0 -// && clauses[0].getOccur() == Occur.MUST){ -// for(BooleanClause c : clauses){ -// resultQuery.add(c); -// } -// }else{ -// resultQuery.add(q2,Occur.MUST); -// } -// -// }else{ -// //q1 instanceof TermQuery -// //q1 instanceof TermRangeQuery -// //q1 instanceof PhraseQuery -// //others -// resultQuery.add(q2,Occur.MUST); -// } -// } -// -// }else if('|' == op.type){ -// if(q1 != null){ -// if(q1 instanceof BooleanQuery){ -// BooleanClause[] clauses = ((BooleanQuery)q1).getClauses(); -// if(clauses.length > 0 -// && clauses[0].getOccur() == Occur.SHOULD){ -// for(BooleanClause c : clauses){ -// resultQuery.add(c); -// } -// }else{ -// resultQuery.add(q1,Occur.SHOULD); -// } -// -// }else{ -// //q1 instanceof TermQuery -// //q1 instanceof TermRangeQuery -// //q1 instanceof PhraseQuery -// //others -// resultQuery.add(q1,Occur.SHOULD); -// } -// } -// -// if(q2 != null){ -// if(q2 instanceof BooleanQuery){ -// BooleanClause[] clauses = ((BooleanQuery)q2).getClauses(); -// if(clauses.length > 0 -// && clauses[0].getOccur() == Occur.SHOULD){ -// for(BooleanClause c : clauses){ -// resultQuery.add(c); -// } -// }else{ -// resultQuery.add(q2,Occur.SHOULD); -// } -// }else{ -// //q2 instanceof TermQuery -// //q2 instanceof TermRangeQuery -// //q2 instanceof PhraseQuery -// //others -// resultQuery.add(q2,Occur.SHOULD); -// -// } -// } -// -// }else if('-' == op.type){ -// if(q1 == null || q2 == null){ -// throw new IllegalStateException("表达式异常:SubQuery 个数不匹配"); -// } -// -// if(q1 instanceof BooleanQuery){ -// BooleanClause[] clauses = ((BooleanQuery)q1).getClauses(); -// if(clauses.length > 0){ -// for(BooleanClause c : clauses){ -// resultQuery.add(c); -// } -// }else{ -// resultQuery.add(q1,Occur.MUST); -// } -// -// }else{ -// //q1 instanceof TermQuery -// //q1 instanceof TermRangeQuery -// //q1 instanceof PhraseQuery -// //others -// resultQuery.add(q1,Occur.MUST); -// } -// -// resultQuery.add(q2,Occur.MUST_NOT); -// } -// return resultQuery; -// } -// -// /** -// * 组装TermRangeQuery -// * @param elements -// * @return -// */ -// private TermRangeQuery toTermRangeQuery(Element fieldNameEle , LinkedList elements){ -// -// boolean includeFirst = false; -// boolean includeLast = false; -// String firstValue = null; -// String lastValue = null; -// //检查第一个元素是否是[或者{ -// Element first = elements.getFirst(); -// if('[' == first.type){ -// includeFirst = true; -// }else if('{' == first.type){ -// includeFirst = false; 
-// }else { -// throw new IllegalStateException("表达式异常"); -// } -// //检查最后一个元素是否是]或者} -// Element last = elements.getLast(); -// if(']' == last.type){ -// includeLast = true; -// }else if('}' == last.type){ -// includeLast = false; -// }else { -// throw new IllegalStateException("表达式异常, RangeQuery缺少结束括号"); -// } -// if(elements.size() < 4 || elements.size() > 5){ -// throw new IllegalStateException("表达式异常, RangeQuery 错误"); -// } -// //读出中间部分 -// Element e2 = elements.get(1); -// if('\'' == e2.type){ -// firstValue = e2.toString(); -// // -// Element e3 = elements.get(2); -// if(',' != e3.type){ -// throw new IllegalStateException("表达式异常, RangeQuery缺少逗号分隔"); -// } -// // -// Element e4 = elements.get(3); -// if('\'' == e4.type){ -// lastValue = e4.toString(); -// }else if(e4 != last){ -// throw new IllegalStateException("表达式异常,RangeQuery格式错误"); -// } -// }else if(',' == e2.type){ -// firstValue = null; -// // -// Element e3 = elements.get(2); -// if('\'' == e3.type){ -// lastValue = e3.toString(); -// }else{ -// throw new IllegalStateException("表达式异常,RangeQuery格式错误"); -// } -// -// }else { -// throw new IllegalStateException("表达式异常, RangeQuery格式错误"); -// } -// -// return new TermRangeQuery(fieldNameEle.toString() , new BytesRef(firstValue) , new BytesRef(lastValue) , includeFirst , includeLast); -// } -// -// /** -// * 比较操作符优先级 -// * @param e1 -// * @param e2 -// * @return -// */ -// private int compare(Element e1 , Element e2){ -// if('&' == e1.type){ -// if('&' == e2.type){ -// return 0; -// }else { -// return 1; -// } -// }else if('|' == e1.type){ -// if('&' == e2.type){ -// return -1; -// }else if('|' == e2.type){ -// return 0; -// }else{ -// return 1; -// } -// }else{ -// if('-' == e2.type){ -// return 0; -// }else{ -// return -1; -// } -// } -// } -// -// /** -// * 表达式元素(操作符、FieldName、FieldValue) -// * @author linliangyi -// * May 20, 2010 -// */ -// private class Element{ -// char type = 0; -// StringBuffer eleTextBuff; -// -// public Element(){ -// eleTextBuff = new StringBuffer(); -// } -// -// public void append(char c){ -// this.eleTextBuff.append(c); -// } -// -// public String toString(){ -// return this.eleTextBuff.toString(); -// } -// } -// -// public static void main(String[] args){ -// IKQueryExpressionParser parser = new IKQueryExpressionParser(); -// //String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'"; -// String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'"; -// Query result = parser.parseExp(ikQueryExp , true); -// System.out.println(result); -// -// } -// -//} +/** + * IK 中文分词 版本 5.0 + * IK Analyzer release 5.0 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ */
+package org.wltea.analyzer.query;
+
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Stack;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * IK简易查询表达式解析
+ * 结合SWMCQuery算法
+ *
+ * 表达式例子 :
+ * (id='1231231' && title:'monkey') || (content:'你好吗' || ulr='www.ik.com') - name:'helloword'
+ * @author linliangyi
+ *
+ */
+public class IKQueryExpressionParser {
+
+    //public static final String LUCENE_SPECIAL_CHAR = "&&||-()':={}[],";
+
+    private List<Element> elements = new ArrayList<Element>();
+
+    private Stack<Query> querys = new Stack<Query>();
+
+    private Stack<Element> operates = new Stack<Element>();
+
+    /**
+     * 解析查询表达式,生成Lucene Query对象
+     *
+     * @param expression
+     * @param quickMode
+     * @return Lucene query
+     */
+    public Query parseExp(String expression , boolean quickMode){
+        Query lucenceQuery = null;
+        if(expression != null && !"".equals(expression.trim())){
+            try{
+                //文法解析
+                this.splitElements(expression);
+                //语法解析
+                this.parseSyntax(quickMode);
+                if(this.querys.size() == 1){
+                    lucenceQuery = this.querys.pop();
+                }else{
+                    throw new IllegalStateException("表达式异常: 缺少逻辑操作符 或 括号缺失");
+                }
+            }finally{
+                elements.clear();
+                querys.clear();
+                operates.clear();
+            }
+        }
+        return lucenceQuery;
+    }
+
+    /**
+     * 表达式文法解析
+     * @param expression
+     */
+    private void splitElements(String expression){
+
+        if(expression == null){
+            return;
+        }
+        Element curretElement = null;
+
+        char[] expChars = expression.toCharArray();
+        for(int i = 0 ; i < expChars.length ; i++){
+            switch(expChars[i]){
+            case '&' :
+                if(curretElement == null){
+                    curretElement = new Element();
+                    curretElement.type = '&';
+                    curretElement.append(expChars[i]);
+                }else if(curretElement.type == '&'){
+                    curretElement.append(expChars[i]);
+                    this.elements.add(curretElement);
+                    curretElement = null;
+                }else if(curretElement.type == '\''){
+                    curretElement.append(expChars[i]);
+                }else {
+                    this.elements.add(curretElement);
+                    curretElement = new Element();
+                    curretElement.type = '&';
+                    curretElement.append(expChars[i]);
+                }
+                break;
+
+            case '|' :
+                if(curretElement == null){
+                    curretElement = new Element();
+                    curretElement.type = '|';
+                    curretElement.append(expChars[i]);
+                }else if(curretElement.type == '|'){
+                    curretElement.append(expChars[i]);
+                    this.elements.add(curretElement);
+                    curretElement = null;
+                }else if(curretElement.type == '\''){
+                    curretElement.append(expChars[i]);
+                }else {
+                    this.elements.add(curretElement);
+                    curretElement = new Element();
+                    curretElement.type = '|';
+                    curretElement.append(expChars[i]);
+                }
+                break;
+
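+            // The remaining single-character operators ('-', '(', ')', ':', '=',
+            // '[', ']', '{', '}' and ',') are all tokenized the same way: inside
+            // a quoted value the character is kept as literal text, otherwise the
+            // pending element is flushed and the operator is emitted as a
+            // standalone one-character element.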
+            case '-' :
+                if(curretElement != null){
+                    if(curretElement.type == '\''){
+                        curretElement.append(expChars[i]);
+                        continue;
+                    }else{
+                        this.elements.add(curretElement);
+                    }
+                }
+                curretElement = new Element();
+                curretElement.type = '-';
+                curretElement.append(expChars[i]);
+                this.elements.add(curretElement);
+                curretElement = null;
+                break;
+
+            case '(' :
+                if(curretElement != null){
+                    if(curretElement.type == '\''){
+                        curretElement.append(expChars[i]);
+                        continue;
+                    }else{
+                        this.elements.add(curretElement);
+                    }
+                }
+                curretElement = new Element();
+                curretElement.type = '(';
+                curretElement.append(expChars[i]);
+                this.elements.add(curretElement);
+                curretElement = null;
+                break;
+
+            case ')' :
+                if(curretElement != null){
+                    if(curretElement.type == '\''){
+                        curretElement.append(expChars[i]);
+                        continue;
+                    }else{
+                        this.elements.add(curretElement);
+                    }
+                }
+                curretElement = new Element();
+                curretElement.type = ')';
+                curretElement.append(expChars[i]);
+                this.elements.add(curretElement);
+                curretElement = null;
+                break;
+
+            case ':' :
+                if(curretElement != null){
+                    if(curretElement.type == '\''){
+                        curretElement.append(expChars[i]);
+                        continue;
+                    }else{
+                        this.elements.add(curretElement);
+                    }
+                }
+                curretElement = new Element();
+                curretElement.type = ':';
+                curretElement.append(expChars[i]);
+                this.elements.add(curretElement);
+                curretElement = null;
+                break;
+
+            case '=' :
+                if(curretElement != null){
+                    if(curretElement.type == '\''){
+                        curretElement.append(expChars[i]);
+                        continue;
+                    }else{
+                        this.elements.add(curretElement);
+                    }
+                }
+                curretElement = new Element();
+                curretElement.type = '=';
+                curretElement.append(expChars[i]);
+                this.elements.add(curretElement);
+                curretElement = null;
+                break;
+
+            case ' ' :
+                if(curretElement != null){
+                    if(curretElement.type == '\''){
+                        curretElement.append(expChars[i]);
+                    }else{
+                        this.elements.add(curretElement);
+                        curretElement = null;
+                    }
+                }
+
+                break;
+
+            case '\'' :
+                if(curretElement == null){
+                    curretElement = new Element();
+                    curretElement.type = '\'';
+
+                }else if(curretElement.type == '\''){
+                    this.elements.add(curretElement);
+                    curretElement = null;
+
+                }else{
+                    this.elements.add(curretElement);
+                    curretElement = new Element();
+                    curretElement.type = '\'';
+
+                }
+                break;
+
+            case '[':
+                if(curretElement != null){
+                    if(curretElement.type == '\''){
+                        curretElement.append(expChars[i]);
+                        continue;
+                    }else{
+                        this.elements.add(curretElement);
+                    }
+                }
+                curretElement = new Element();
+                curretElement.type = '[';
+                curretElement.append(expChars[i]);
+                this.elements.add(curretElement);
+                curretElement = null;
+                break;
+
+            case ']':
+                if(curretElement != null){
+                    if(curretElement.type == '\''){
+                        curretElement.append(expChars[i]);
+                        continue;
+                    }else{
+                        this.elements.add(curretElement);
+                    }
+                }
+                curretElement = new Element();
+                curretElement.type = ']';
+                curretElement.append(expChars[i]);
+                this.elements.add(curretElement);
+                curretElement = null;
+
+                break;
+
+            case '{':
+                if(curretElement != null){
+                    if(curretElement.type == '\''){
+                        curretElement.append(expChars[i]);
+                        continue;
+                    }else{
+                        this.elements.add(curretElement);
+                    }
+                }
+                curretElement = new Element();
+                curretElement.type = '{';
+                curretElement.append(expChars[i]);
+                this.elements.add(curretElement);
+                curretElement = null;
+                break;
+
+            case '}':
+                if(curretElement != null){
+                    if(curretElement.type == '\''){
+                        curretElement.append(expChars[i]);
+                        continue;
+                    }else{
+                        this.elements.add(curretElement);
+                    }
+                }
+                curretElement = new Element();
+                curretElement.type = '}';
+                curretElement.append(expChars[i]);
+                this.elements.add(curretElement);
+                curretElement = null;
+
+                break;
+            case ',':
+                if(curretElement != null){
+                    if(curretElement.type == '\''){
+                        curretElement.append(expChars[i]);
+                        continue;
+                    }else{
+                        this.elements.add(curretElement);
+                    }
+                }
+                curretElement = new Element();
+                curretElement.type = ',';
+                curretElement.append(expChars[i]);
+                this.elements.add(curretElement);
+                curretElement = null;
+
+                break;
+
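+            // Any other character starts or extends an 'F' element, i.e. a field
+            // name or an unquoted value; inside quotes it is appended verbatim.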
+            default :
+                if(curretElement == null){
+                    curretElement = new Element();
+                    curretElement.type = 'F';
+                    curretElement.append(expChars[i]);
+
+                }else if(curretElement.type == 'F'){
+                    curretElement.append(expChars[i]);
+
+                }else if(curretElement.type == '\''){
+                    curretElement.append(expChars[i]);
+
+                }else{
+                    this.elements.add(curretElement);
+                    curretElement = new Element();
+                    curretElement.type = 'F';
+                    curretElement.append(expChars[i]);
+                }
+            }
+        }
+
+        if(curretElement != null){
+            this.elements.add(curretElement);
+            curretElement = null;
+        }
+    }
+
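+    // parseSyntax evaluates the element list with a shunting-yard style
+    // algorithm: operand queries are pushed onto the querys stack, operators
+    // onto the operates stack, and an operator on top of the stack with equal
+    // or higher precedence is reduced into a BooleanQuery via toBooleanQuery(),
+    // which also flattens a nested BooleanQuery whose clauses already use the
+    // same Occur. Operator precedence (see compare()): '&' > '|' > '-'.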
+    /**
+     * 语法解析
+     *
+     */
+    private void parseSyntax(boolean quickMode){
+        for(int i = 0 ; i < this.elements.size() ; i++){
+            Element e = this.elements.get(i);
+            if('F' == e.type){
+                Element e2 = this.elements.get(i + 1);
+                if('=' != e2.type && ':' != e2.type){
+                    throw new IllegalStateException("表达式异常: = 或 : 号丢失");
+                }
+                Element e3 = this.elements.get(i + 2);
+                //处理 = 和 : 运算
+                if('\'' == e3.type){
+                    i+=2;
+                    if('=' == e2.type){
+                        TermQuery tQuery = new TermQuery(new Term(e.toString() , e3.toString()));
+                        this.querys.push(tQuery);
+                    }else if(':' == e2.type){
+                        String keyword = e3.toString();
+                        //SWMCQuery Here
+                        Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword , quickMode);
+                        this.querys.push(_SWMCQuery);
+                    }
+
+                }else if('[' == e3.type || '{' == e3.type){
+                    i+=2;
+                    //处理 [] 和 {}
+                    LinkedList<Element> eQueue = new LinkedList<Element>();
+                    eQueue.add(e3);
+                    for( i++ ; i < this.elements.size() ; i++){
+                        Element eN = this.elements.get(i);
+                        eQueue.add(eN);
+                        if(']' == eN.type || '}' == eN.type){
+                            break;
+                        }
+                    }
+                    //翻译RangeQuery
+                    Query rangeQuery = this.toTermRangeQuery(e , eQueue);
+                    this.querys.push(rangeQuery);
+                }else{
+                    throw new IllegalStateException("表达式异常:匹配值丢失");
+                }
+
+            }else if('(' == e.type){
+                this.operates.push(e);
+
+            }else if(')' == e.type){
+                boolean doPop = true;
+                while(doPop && !this.operates.empty()){
+                    Element op = this.operates.pop();
+                    if('(' == op.type){
+                        doPop = false;
+                    }else {
+                        Query q = toBooleanQuery(op);
+                        this.querys.push(q);
+                    }
+
+                }
+            }else{
+
+                if(this.operates.isEmpty()){
+                    this.operates.push(e);
+                }else{
+                    boolean doPeek = true;
+                    while(doPeek && !this.operates.isEmpty()){
+                        Element eleOnTop = this.operates.peek();
+                        if('(' == eleOnTop.type){
+                            doPeek = false;
+                            this.operates.push(e);
+                        }else if(compare(e , eleOnTop) == 1){
+                            this.operates.push(e);
+                            doPeek = false;
+                        }else if(compare(e , eleOnTop) == 0){
+                            Query q = toBooleanQuery(eleOnTop);
+                            this.operates.pop();
+                            this.querys.push(q);
+                        }else{
+                            Query q = toBooleanQuery(eleOnTop);
+                            this.operates.pop();
+                            this.querys.push(q);
+                        }
+                    }
+
+                    if(doPeek && this.operates.empty()){
+                        this.operates.push(e);
+                    }
+                }
+            }
+        }
+
+        while(!this.operates.isEmpty()){
+            Element eleOnTop = this.operates.pop();
+            Query q = toBooleanQuery(eleOnTop);
+            this.querys.push(q);
+        }
+    }
+
+    /**
+     * 根据逻辑操作符,生成BooleanQuery
+     * @param op
+     * @return
+     */
+    private Query toBooleanQuery(Element op){
+        if(this.querys.size() == 0){
+            return null;
+        }
+
+        BooleanQuery resultQuery = new BooleanQuery();
+
+        if(this.querys.size() == 1){
+            return this.querys.get(0);
+        }
+
+        Query q2 = this.querys.pop();
+        Query q1 = this.querys.pop();
+        if('&' == op.type){
+            if(q1 != null){
+                if(q1 instanceof BooleanQuery){
+                    BooleanClause[] clauses = ((BooleanQuery)q1).getClauses();
+                    if(clauses.length > 0
+                            && clauses[0].getOccur() == Occur.MUST){
+                        for(BooleanClause c : clauses){
+                            resultQuery.add(c);
+                        }
+                    }else{
+                        resultQuery.add(q1,Occur.MUST);
+                    }
+
+                }else{
+                    //q1 instanceof TermQuery
+                    //q1 instanceof TermRangeQuery
+                    //q1 instanceof PhraseQuery
+                    //others
+                    resultQuery.add(q1,Occur.MUST);
+                }
+            }
+
+            if(q2 != null){
+                if(q2 instanceof BooleanQuery){
+                    BooleanClause[] clauses = ((BooleanQuery)q2).getClauses();
+                    if(clauses.length > 0
+                            && clauses[0].getOccur() == Occur.MUST){
+                        for(BooleanClause c : clauses){
+                            resultQuery.add(c);
+                        }
+                    }else{
+                        resultQuery.add(q2,Occur.MUST);
+                    }
+
+                }else{
+                    //q2 instanceof TermQuery
+                    //q2 instanceof TermRangeQuery
+                    //q2 instanceof PhraseQuery
+                    //others
+                    resultQuery.add(q2,Occur.MUST);
+                }
+            }
+
+        }else if('|' == op.type){
+            if(q1 != null){
+                if(q1 instanceof BooleanQuery){
+                    BooleanClause[] clauses = ((BooleanQuery)q1).getClauses();
+                    if(clauses.length > 0
+                            && clauses[0].getOccur() == Occur.SHOULD){
+                        for(BooleanClause c : clauses){
+                            resultQuery.add(c);
+                        }
+                    }else{
+                        resultQuery.add(q1,Occur.SHOULD);
+                    }
+
+                }else{
+                    //q1 instanceof TermQuery
+                    //q1 instanceof TermRangeQuery
+                    //q1 instanceof PhraseQuery
+                    //others
+                    resultQuery.add(q1,Occur.SHOULD);
+                }
+            }
+
+            if(q2 != null){
+                if(q2 instanceof BooleanQuery){
+                    BooleanClause[] clauses = ((BooleanQuery)q2).getClauses();
+                    if(clauses.length > 0
+                            && clauses[0].getOccur() == Occur.SHOULD){
+                        for(BooleanClause c : clauses){
+                            resultQuery.add(c);
+                        }
+                    }else{
+                        resultQuery.add(q2,Occur.SHOULD);
+                    }
+                }else{
+                    //q2 instanceof TermQuery
+                    //q2 instanceof TermRangeQuery
+                    //q2 instanceof PhraseQuery
+                    //others
+                    resultQuery.add(q2,Occur.SHOULD);
+
+                }
+            }
+
+        }else if('-' == op.type){
+            if(q1 == null || q2 == null){
+                throw new IllegalStateException("表达式异常:SubQuery 个数不匹配");
+            }
+
+            if(q1 instanceof BooleanQuery){
+                BooleanClause[] clauses = ((BooleanQuery)q1).getClauses();
+                if(clauses.length > 0){
+                    for(BooleanClause c : clauses){
+                        resultQuery.add(c);
+                    }
+                }else{
+                    resultQuery.add(q1,Occur.MUST);
+                }
+
+            }else{
+                //q1 instanceof TermQuery
+                //q1 instanceof TermRangeQuery
+                //q1 instanceof PhraseQuery
+                //others
+                resultQuery.add(q1,Occur.MUST);
+            }
+
+            resultQuery.add(q2,Occur.MUST_NOT);
+        }
+        return resultQuery;
+    }
+
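+    // toTermRangeQuery translates a bracketed payload such as ['v1','v2'] or
+    // {'v1','v2'} into a Lucene TermRangeQuery: '[' / ']' make a bound
+    // inclusive, '{' / '}' exclusive, and an omitted bound (e.g. ['v1',])
+    // yields an open-ended range, passing null instead of a BytesRef.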
+    /**
+     * 组装TermRangeQuery
+     * @param elements
+     * @return
+     */
+    private TermRangeQuery toTermRangeQuery(Element fieldNameEle , LinkedList<Element> elements){
+
+        boolean includeFirst = false;
+        boolean includeLast = false;
+        String firstValue = null;
+        String lastValue = null;
+        //检查第一个元素是否是[或者{
+        Element first = elements.getFirst();
+        if('[' == first.type){
+            includeFirst = true;
+        }else if('{' == first.type){
+            includeFirst = false;
+        }else {
+            throw new IllegalStateException("表达式异常");
+        }
+        //检查最后一个元素是否是]或者}
+        Element last = elements.getLast();
+        if(']' == last.type){
+            includeLast = true;
+        }else if('}' == last.type){
+            includeLast = false;
+        }else {
+            throw new IllegalStateException("表达式异常, RangeQuery缺少结束括号");
+        }
+        if(elements.size() < 4 || elements.size() > 5){
+            throw new IllegalStateException("表达式异常, RangeQuery 错误");
+        }
+        //读出中间部分
+        Element e2 = elements.get(1);
+        if('\'' == e2.type){
+            firstValue = e2.toString();
+            //
+            Element e3 = elements.get(2);
+            if(',' != e3.type){
+                throw new IllegalStateException("表达式异常, RangeQuery缺少逗号分隔");
+            }
+            //
+            Element e4 = elements.get(3);
+            if('\'' == e4.type){
+                lastValue = e4.toString();
+            }else if(e4 != last){
+                throw new IllegalStateException("表达式异常,RangeQuery格式错误");
+            }
+        }else if(',' == e2.type){
+            firstValue = null;
+            //
+            Element e3 = elements.get(2);
+            if('\'' == e3.type){
+                lastValue = e3.toString();
+            }else{
+                throw new IllegalStateException("表达式异常,RangeQuery格式错误");
+            }
+
+        }else {
+            throw new IllegalStateException("表达式异常, RangeQuery格式错误");
+        }
+
+        return new TermRangeQuery(fieldNameEle.toString() ,
+                firstValue == null ? null : new BytesRef(firstValue) ,
+                lastValue == null ? null : new BytesRef(lastValue) ,
+                includeFirst , includeLast);
+    }
+
+    /**
+     * 比较操作符优先级
+     * @param e1
+     * @param e2
+     * @return
+     */
+    private int compare(Element e1 , Element e2){
+        if('&' == e1.type){
+            if('&' == e2.type){
+                return 0;
+            }else {
+                return 1;
+            }
+        }else if('|' == e1.type){
+            if('&' == e2.type){
+                return -1;
+            }else if('|' == e2.type){
+                return 0;
+            }else{
+                return 1;
+            }
+        }else{
+            if('-' == e2.type){
+                return 0;
+            }else{
+                return -1;
+            }
+        }
+    }
+
+    /**
+     * 表达式元素(操作符、FieldName、FieldValue)
+     * @author linliangyi
+     * May 20, 2010
+     */
+    private class Element{
+        char type = 0;
+        StringBuffer eleTextBuff;
+
+        public Element(){
+            eleTextBuff = new StringBuffer();
+        }
+
+        public void append(char c){
+            this.eleTextBuff.append(c);
+        }
+
+        public String toString(){
+            return this.eleTextBuff.toString();
+        }
+    }
+
+    public static void main(String[] args){
+        IKQueryExpressionParser parser = new IKQueryExpressionParser();
+        //String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'";
+        String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'";
+        Query result = parser.parseExp(ikQueryExp , true);
+        System.out.println(result);
+
+    }
+
+}
diff --git a/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java b/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java
index 1c3bd42..e8c00d1 100644
--- a/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java
+++ b/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java
@@ -1,153 +1,159 @@
-///**
-// * IK 中文分词 版本 5.0
-// * IK Analyzer release 5.0
-// *
-// * Licensed to the Apache Software Foundation (ASF) under one or more
-// * contributor license agreements. See the NOTICE file distributed with
-// * this work for additional information regarding copyright ownership.
-// * The ASF licenses this file to You under the Apache License, Version 2.0
-// * (the "License"); you may not use this file except in compliance with
-// * the License. You may obtain a copy of the License at
-// *
-// * http://www.apache.org/licenses/LICENSE-2.0
-// *
-// * Unless required by applicable law or agreed to in writing, software
-// * distributed under the License is distributed on an "AS IS" BASIS,
-// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// * See the License for the specific language governing permissions and
-// * limitations under the License.
-// * -// * 源代码由林良益(linliangyi2005@gmail.com)提供 -// * 版权声明 2012,乌龙茶工作室 -// * provided by Linliangyi and copyright 2012 by Oolong studio -// * -// */ -//package org.wltea.analyzer.query; -// -//import java.io.IOException; -//import java.io.StringReader; -//import java.util.ArrayList; -//import java.util.List; -// -//import org.apache.lucene.analysis.standard.StandardAnalyzer; -//import org.apache.lucene.queryparser.classic.ParseException; -//import org.apache.lucene.queryparser.classic.QueryParser; -//import org.apache.lucene.search.Query; -//import org.apache.lucene.util.Version; -//import org.wltea.analyzer.core.IKSegmenter; -//import org.wltea.analyzer.core.Lexeme; -// -///** -// * Single Word Multi Char Query Builder -// * IK分词算法专用 -// * @author linliangyi -// * -// */ -//public class SWMCQueryBuilder { -// -// /** -// * 生成SWMCQuery -// * @param fieldName -// * @param keywords -// * @param quickMode -// * @return Lucene Query -// */ -// public static Query create(String fieldName ,String keywords , boolean quickMode){ -// if(fieldName == null || keywords == null){ -// throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null."); -// } -// //1.对keywords进行分词处理 -// List lexemes = doAnalyze(keywords); -// //2.根据分词结果,生成SWMCQuery -// Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode); -// return _SWMCQuery; -// } -// -// /** -// * 分词切分,并返回结链表 -// * @param keywords -// * @return -// */ -// private static List doAnalyze(String keywords){ -// List lexemes = new ArrayList(); -// IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true); -// try{ -// Lexeme l = null; -// while( (l = ikSeg.next()) != null){ -// lexemes.add(l); -// } -// }catch(IOException e){ -// e.printStackTrace(); -// } -// return lexemes; -// } -// -// -// /** -// * 根据分词结果生成SWMC搜索 -// * @param fieldName +/** + * IK 中文分词 版本 5.0 + * IK Analyzer release 5.0 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ */
+package org.wltea.analyzer.query;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.Version;
+import org.wltea.analyzer.core.IKSegmenter;
+import org.wltea.analyzer.core.Lexeme;
+
+/**
+ * Single Word Multi Char Query Builder
+ * IK分词算法专用
+ * @author linliangyi
+ *
+ */
+public class SWMCQueryBuilder {
+
+    /**
+     * 生成SWMCQuery
+     * @param fieldName
+     * @param keywords
+     * @param quickMode
+     * @return Lucene Query
+     */
+    public static Query create(String fieldName ,String keywords , boolean quickMode){
+        if(fieldName == null || keywords == null){
+            throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
+        }
+        //1.对keywords进行分词处理
+        List<Lexeme> lexemes = doAnalyze(keywords);
+        //2.根据分词结果,生成SWMCQuery
+        Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
+        return _SWMCQuery;
+    }
+
+    /**
+     * 分词切分,并返回结链表
+     * @param keywords
+     * @return
+     */
+    private static List<Lexeme> doAnalyze(String keywords){
+        List<Lexeme> lexemes = new ArrayList<Lexeme>();
+        IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
+        try{
+            Lexeme l = null;
+            while( (l = ikSeg.next()) != null){
+                lexemes.add(l);
+            }
+        }catch(IOException e){
+            e.printStackTrace();
+        }
+        return lexemes;
+    }
+
+
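+    // getSWMCQuery rebuilds the keywords as a space-separated expression in
+    // which adjacent single-character lexemes are fused back together (the
+    // Single Word Multi Char trick), then hands the expression to Lucene's
+    // QueryParser (AND operator, auto-generated phrase queries). In quickMode
+    // a shortened expression without single-character noise words is used when
+    // it still covers more than half of the original characters.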
+    /**
+     * 根据分词结果生成SWMC搜索
+     * @param fieldName
 // * @param pathOption
-// * @param quickMode
-// * @return
-// */
-// private static Query getSWMCQuery(String fieldName , List lexemes , boolean quickMode){
-// //构造SWMC的查询表达式
-// StringBuffer keywordBuffer = new StringBuffer();
-// //精简的SWMC的查询表达式
-// StringBuffer keywordBuffer_Short = new StringBuffer();
-// //记录最后词元长度
-// int lastLexemeLength = 0;
-// //记录最后词元结束位置
-// int lastLexemeEnd = -1;
-//
-// int shortCount = 0;
-// int totalCount = 0;
-// for(Lexeme l : lexemes){
-// totalCount += l.getLength();
-// //精简表达式
-// if(l.getLength() > 1){
-// keywordBuffer_Short.append(' ').append(l.getLexemeText());
-// shortCount += l.getLength();
-// }
-//
-// if(lastLexemeLength == 0){
-// keywordBuffer.append(l.getLexemeText());
-// }else if(lastLexemeLength == 1 && l.getLength() == 1
-// && lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并)
-// keywordBuffer.append(l.getLexemeText());
-// }else{
-// keywordBuffer.append(' ').append(l.getLexemeText());
-//
-// }
-// lastLexemeLength = l.getLength();
-// lastLexemeEnd = l.getEndPosition();
-// }
-//
-// //借助lucene queryparser 生成SWMC Query
-// QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40));
-// qp.setDefaultOperator(QueryParser.AND_OPERATOR);
-// qp.setAutoGeneratePhraseQueries(true);
-//
-// if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
-// try {
-// //System.out.println(keywordBuffer.toString());
-// Query q = qp.parse(keywordBuffer_Short.toString());
-// return q;
-// } catch (ParseException e) {
-// e.printStackTrace();
-// }
-//
-// }else{
-// if(keywordBuffer.length() > 0){
-// try {
-// //System.out.println(keywordBuffer.toString());
-// Query q = qp.parse(keywordBuffer.toString());
-// return q;
-// } catch (ParseException e) {
-// e.printStackTrace();
-// }
-// }
-// }
-// return null;
-// }
-//}
+     * @param quickMode
+     * @return
+     */
+    private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){
+        //构造SWMC的查询表达式
+        StringBuffer keywordBuffer = new StringBuffer();
+        //精简的SWMC的查询表达式
+        StringBuffer keywordBuffer_Short = new StringBuffer();
+        //记录最后词元长度
+        int lastLexemeLength = 0;
+        //记录最后词元结束位置
+        int lastLexemeEnd = -1;
+
+        int shortCount = 0;
+        int totalCount = 0;
+        for(Lexeme l : lexemes){
+            totalCount += l.getLength();
+            //精简表达式
+            if(l.getLength() > 1){
+                keywordBuffer_Short.append(' ').append(l.getLexemeText());
+                shortCount += l.getLength();
+            }
+
+            if(lastLexemeLength == 0){
+                keywordBuffer.append(l.getLexemeText());
+            }else if(lastLexemeLength == 1 && l.getLength() == 1
+                    && lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并)
+                keywordBuffer.append(l.getLexemeText());
+            }else{
+                keywordBuffer.append(' ').append(l.getLexemeText());
+
+            }
+            lastLexemeLength = l.getLength();
+            lastLexemeEnd = l.getEndPosition();
+        }
+
+        //借助lucene queryparser 生成SWMC Query
+        QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40));
+        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
+        qp.setAutoGeneratePhraseQueries(true);
+
+        if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
+            try {
+                //System.out.println(keywordBuffer.toString());
+                Query q = qp.parse(keywordBuffer_Short.toString());
+                return q;
+            } catch (ParseException e) {
+                e.printStackTrace();
+            }
+
+        }else{
+            if(keywordBuffer.length() > 0){
+                try {
+                    //System.out.println(keywordBuffer.toString());
+                    Query q = qp.parse(keywordBuffer.toString());
+                    return q;
+                } catch (ParseException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+        return null;
+    }
+}
diff --git a/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java b/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java
index e6a9e9f..32a998d 100644
--- a/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java
+++ b/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java
@@ -1,147 +1,147 @@
-///**
-// * IK 中文分词 版本 5.0
-// * IK Analyzer release 5.0
-// *
-// * Licensed to the Apache Software Foundation (ASF) under one or more
-// * contributor license agreements. See the NOTICE file distributed with
-// * this work for additional information regarding copyright ownership.
-// * The ASF licenses this file to You under the Apache License, Version 2.0
-// * (the "License"); you may not use this file except in compliance with
-// * the License. You may obtain a copy of the License at
-// *
-// * http://www.apache.org/licenses/LICENSE-2.0
-// *
-// * Unless required by applicable law or agreed to in writing, software
-// * distributed under the License is distributed on an "AS IS" BASIS,
-// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// * See the License for the specific language governing permissions and
-// * limitations under the License.
-// * -// * 源代码由林良益(linliangyi2005@gmail.com)提供 -// * 版权声明 2012,乌龙茶工作室 -// * provided by Linliangyi and copyright 2012 by Oolong studio -// * -// * -// */ -//package org.wltea.analyzer.sample; -// -//import java.io.IOException; -// -//import org.apache.lucene.analysis.Analyzer; -//import org.apache.lucene.document.Document; -//import org.apache.lucene.document.Field; -//import org.apache.lucene.document.StringField; -//import org.apache.lucene.document.TextField; -//import org.apache.lucene.index.CorruptIndexException; -//import org.apache.lucene.index.DirectoryReader; -//import org.apache.lucene.index.IndexReader; -//import org.apache.lucene.index.IndexWriter; -//import org.apache.lucene.index.IndexWriterConfig; -//import org.apache.lucene.index.IndexWriterConfig.OpenMode; -//import org.apache.lucene.queryparser.classic.ParseException; -//import org.apache.lucene.queryparser.classic.QueryParser; -//import org.apache.lucene.search.IndexSearcher; -//import org.apache.lucene.search.Query; -//import org.apache.lucene.search.ScoreDoc; -//import org.apache.lucene.search.TopDocs; -//import org.apache.lucene.store.Directory; -//import org.apache.lucene.store.LockObtainFailedException; -//import org.apache.lucene.store.RAMDirectory; -//import org.apache.lucene.util.Version; -//import org.wltea.analyzer.lucene.IKAnalyzer; -// -// -// -// -///** -// * 使用IKAnalyzer进行Lucene索引和查询的演示 -// * 2012-3-2 -// * -// * 以下是结合Lucene4.0 API的写法 -// * -// */ -//public class LuceneIndexAndSearchDemo { -// -// -// /** -// * 模拟: -// * 创建一个单条记录的索引,并对其进行搜索 -// * @param args -// */ -// public static void main(String[] args){ -// //Lucene Document的域名 -// String fieldName = "text"; -// //检索内容 -// String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。"; -// -// //实例化IKAnalyzer分词器 -// Analyzer analyzer = new IKAnalyzer(true); -// -// Directory directory = null; -// IndexWriter iwriter = null; -// IndexReader ireader = null; -// IndexSearcher isearcher = null; -// try { -// //建立内存索引对象 -// directory = new RAMDirectory(); -// -// //配置IndexWriterConfig -// IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer); -// iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); -// iwriter = new IndexWriter(directory , iwConfig); -// //写入索引 -// Document doc = new Document(); -// doc.add(new StringField("ID", "10000", Field.Store.YES)); -// doc.add(new TextField(fieldName, text, Field.Store.YES)); -// iwriter.addDocument(doc); -// iwriter.close(); -// -// -// //搜索过程********************************** -// //实例化搜索器 -// ireader = DirectoryReader.open(directory); -// isearcher = new IndexSearcher(ireader); -// -// String keyword = "中文分词工具包"; -// //使用QueryParser查询分析器构造Query对象 -// QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer); -// qp.setDefaultOperator(QueryParser.AND_OPERATOR); -// Query query = qp.parse(keyword); -// System.out.println("Query = " + query); -// -// //搜索相似度最高的5条记录 -// TopDocs topDocs = isearcher.search(query , 5); -// System.out.println("命中:" + topDocs.totalHits); -// //输出结果 -// ScoreDoc[] scoreDocs = topDocs.scoreDocs; -// for (int i = 0; i < topDocs.totalHits; i++){ -// Document targetDoc = isearcher.doc(scoreDocs[i].doc); -// System.out.println("内容:" + targetDoc.toString()); -// } -// -// } catch (CorruptIndexException e) { -// e.printStackTrace(); -// } catch (LockObtainFailedException e) { -// e.printStackTrace(); -// } catch (IOException e) { -// e.printStackTrace(); -// } catch (ParseException e) { -// e.printStackTrace(); -// } finally{ -// if(ireader != 
null){ -// try { -// ireader.close(); -// } catch (IOException e) { -// e.printStackTrace(); -// } -// } -// if(directory != null){ -// try { -// directory.close(); -// } catch (IOException e) { -// e.printStackTrace(); -// } -// } -// } -// } -//} +/** + * IK 中文分词 版本 5.0 + * IK Analyzer release 5.0 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * 源代码由林良益(linliangyi2005@gmail.com)提供 + * 版权声明 2012,乌龙茶工作室 + * provided by Linliangyi and copyright 2012 by Oolong studio + * + * + */ +package org.wltea.analyzer.sample; + +import java.io.IOException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.LockObtainFailedException; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.Version; +import org.wltea.analyzer.lucene.IKAnalyzer; + + + + +/** + * 使用IKAnalyzer进行Lucene索引和查询的演示 + * 2012-3-2 + * + * 以下是结合Lucene4.0 API的写法 + * + */ +public class LuceneIndexAndSearchDemo { + + + /** + * 模拟: + * 创建一个单条记录的索引,并对其进行搜索 + * @param args + */ + public static void main(String[] args){ + //Lucene Document的域名 + String fieldName = "text"; + //检索内容 + String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。"; + + //实例化IKAnalyzer分词器 + Analyzer analyzer = new IKAnalyzer(true); + + Directory directory = null; + IndexWriter iwriter = null; + IndexReader ireader = null; + IndexSearcher isearcher = null; + try { + //建立内存索引对象 + directory = new RAMDirectory(); + + //配置IndexWriterConfig + IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer); + iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); + iwriter = new IndexWriter(directory , iwConfig); + //写入索引 + Document doc = new Document(); + doc.add(new StringField("ID", "10000", Field.Store.YES)); + doc.add(new TextField(fieldName, text, Field.Store.YES)); + iwriter.addDocument(doc); + iwriter.close(); + + + //搜索过程********************************** + //实例化搜索器 + ireader = DirectoryReader.open(directory); + isearcher = new 
IndexSearcher(ireader); + + String keyword = "中文分词工具包"; + //使用QueryParser查询分析器构造Query对象 + QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer); + qp.setDefaultOperator(QueryParser.AND_OPERATOR); + Query query = qp.parse(keyword); + System.out.println("Query = " + query); + + //搜索相似度最高的5条记录 + TopDocs topDocs = isearcher.search(query , 5); + System.out.println("命中:" + topDocs.totalHits); + //输出结果 + ScoreDoc[] scoreDocs = topDocs.scoreDocs; + for (int i = 0; i < topDocs.totalHits; i++){ + Document targetDoc = isearcher.doc(scoreDocs[i].doc); + System.out.println("内容:" + targetDoc.toString()); + } + + } catch (CorruptIndexException e) { + e.printStackTrace(); + } catch (LockObtainFailedException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } catch (ParseException e) { + e.printStackTrace(); + } finally{ + if(ireader != null){ + try { + ireader.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + if(directory != null){ + try { + directory.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } +}