elasticsearch ik 0.20.x => 0.90.x

wangweihua 2013-05-09 13:46:25 +08:00
parent a2dc3c7842
commit 5e14e3d629
16 changed files with 1520 additions and 1580 deletions

View File: pom.xml

@@ -31,7 +31,7 @@
     </parent>
     <properties>
-        <elasticsearch.version>0.20.2</elasticsearch.version>
+        <elasticsearch.version>0.90.0</elasticsearch.version>
     </properties>
     <repositories>
@@ -132,4 +132,4 @@
         </plugin>
     </plugins>
 </build>
 </project>

View File: org/elasticsearch/index/analysis/IkAnalyzer.java

@@ -2,19 +2,32 @@ package org.elasticsearch.index.analysis;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.wltea.analyzer.lucene.IKTokenizer;
-//import org.wltea.lucene.IKTokenizer;
 import java.io.Reader;

 public class IkAnalyzer extends Analyzer {
-    @Override public TokenStream tokenStream(String fieldName, Reader reader) {
-        return new IKTokenizer(reader,true);
-    }
+//    private boolean isMaxWordLength = false;
+//    @Override public TokenStream tokenStream(String fieldName, Reader reader) {
+//        return new IKTokenizer(reader,true);
+//    }

     public IkAnalyzer() {
         super();
     }

+    @Override
+    protected TokenStreamComponents createComponents(String s, Reader reader) {
+        // new TokenStreamComponents
+        Tokenizer tokenizer = new IKTokenizer(reader, true);
+        return new TokenStreamComponents(tokenizer, null); //To change body of implemented methods use File | Settings | File Templates.
+    }
+
+//    public boolean isMaxWordLength() {
+//        return isMaxWordLength;
+//    }
 }
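
Note: under Lucene 4.x (which Elasticsearch 0.90 embeds), Analyzer.tokenStream(String, Reader) is final and analyzers override createComponents instead, which is exactly what this hunk does. A minimal sketch of the same migration, assuming the plugin's IKTokenizer is on the classpath and the IK dictionary has already been initialized; it uses the single-argument TokenStreamComponents constructor so the tokenizer doubles as the sink, instead of passing null as above:

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.wltea.analyzer.lucene.IKTokenizer;

    public final class IkAnalyzerSketch extends Analyzer {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            // Build the token chain once; Lucene reuses the components per thread.
            Tokenizer tokenizer = new IKTokenizer(reader, true);
            return new TokenStreamComponents(tokenizer);
        }

        public static void main(String[] args) throws IOException {
            // Precondition: Dictionary.getInstance().Init(indexSettings) has run.
            Analyzer analyzer = new IkAnalyzerSketch();
            TokenStream ts = analyzer.tokenStream("f", new StringReader("中华人民共和国"));
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                     // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();
            ts.close();
        }
    }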

View File: org/wltea/analyzer/core/AnalyzeContext.java

@@ -24,11 +24,16 @@
  */
 package org.wltea.analyzer.core;

-import org.wltea.analyzer.dic.Dictionary;
 import java.io.IOException;
 import java.io.Reader;
-import java.util.*;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.Map;
+import java.util.Set;
+
+import org.wltea.analyzer.cfg.Configuration;
+import org.wltea.analyzer.dic.Dictionary;

 /**
  *
@@ -68,12 +73,12 @@ class AnalyzeContext {
     private Map<Integer , LexemePath> pathMap;
     //最终分词结果集
     private LinkedList<Lexeme> results;
+    private boolean useSmart;
     //分词器配置项
-    private boolean useSmart;
+//    private Configuration cfg;

     public AnalyzeContext(boolean useSmart){
         this.useSmart = useSmart;
         this.segmentBuff = new char[BUFF_SIZE];
         this.charTypes = new int[BUFF_SIZE];
         this.buffLocker = new HashSet<String>();
@@ -313,7 +318,7 @@ class AnalyzeContext {
         while(result != null){
             //数量词合并
             this.compound(result);
-            if(Dictionary.isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
+            if(Dictionary.getSingleton().isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
                 //是停止词继续取列表的下一个
                 result = this.results.pollFirst();
             }else{
@@ -344,6 +349,7 @@ class AnalyzeContext {
      * 组合词元
      */
     private void compound(Lexeme result){
+
         if(!this.useSmart){
             return ;
         }

View File: org/wltea/analyzer/core/CJKSegmenter.java

@@ -25,12 +25,12 @@
  */
 package org.wltea.analyzer.core;

-import org.wltea.analyzer.dic.Dictionary;
-import org.wltea.analyzer.dic.Hit;
 import java.util.LinkedList;
 import java.util.List;
+
+import org.wltea.analyzer.dic.Dictionary;
+import org.wltea.analyzer.dic.Hit;

 /**
  * 中文-日韩文子分词器
@@ -58,7 +58,7 @@ class CJKSegmenter implements ISegmenter {
             //处理词段队列
             Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
             for(Hit hit : tmpArray){
-                hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit);
+                hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
                 if(hit.isMatch()){
                     //输出当前的词
                     Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
@@ -77,7 +77,7 @@ class CJKSegmenter implements ISegmenter {
             //*********************************
             //再对当前指针位置的字符进行单字匹配
-            Hit singleCharHit = Dictionary.matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
+            Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
             if(singleCharHit.isMatch()){//首字成词
                 //输出当前的词
                 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
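
Note: the calls above move from static Dictionary methods to instance methods on Dictionary.getSingleton(), and matchInMainDictWithHit is renamed matchWithHit. The Hit carries the trie node where the previous match stopped, so each pending prefix is extended by exactly one character per cursor step rather than re-scanned from its start. A hypothetical, self-contained walk over a buffer illustrating that pattern (class and variable names here are illustrative, and the dictionary singleton must already be initialized):

    import java.util.LinkedList;
    import java.util.List;
    import org.wltea.analyzer.dic.Dictionary;
    import org.wltea.analyzer.dic.Hit;

    public class PrefixWalkSketch {
        public static void main(String[] args) {
            // Precondition: Dictionary.getInstance().Init(indexSettings) has run.
            Dictionary dict = Dictionary.getSingleton();
            char[] buff = "中华人民共和国".toCharArray();
            List<Hit> pending = new LinkedList<Hit>();   // open prefixes, like tmpHits
            for (int cursor = 0; cursor < buff.length; cursor++) {
                // Extend every open prefix by the character at `cursor`.
                for (Hit hit : pending.toArray(new Hit[pending.size()])) {
                    hit = dict.matchWithHit(buff, cursor, hit);
                    if (hit.isMatch()) {
                        System.out.println("word: [" + hit.getBegin() + "," + cursor + "]");
                    }
                    if (!hit.isPrefix()) {
                        pending.remove(hit);             // dead branch, stop tracking it
                    }
                }
                // Open a new match starting at the cursor position.
                Hit single = dict.matchInMainDict(buff, cursor, 1);
                if (single.isMatch()) {
                    System.out.println("word: [" + cursor + "," + cursor + "]");
                }
                if (single.isPrefix()) {
                    pending.add(single);
                }
            }
        }
    }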

View File: org/wltea/analyzer/core/CN_QuantifierSegmenter.java

@@ -24,14 +24,14 @@
  */
 package org.wltea.analyzer.core;

-import org.wltea.analyzer.dic.Dictionary;
-import org.wltea.analyzer.dic.Hit;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;
+
+import org.wltea.analyzer.dic.Dictionary;
+import org.wltea.analyzer.dic.Hit;

 /**
  *
  * 中文数量词子分词器
@@ -155,7 +155,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
             //处理词段队列
             Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
             for(Hit hit : tmpArray){
-                hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit);
+                hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
                 if(hit.isMatch()){
                     //输出当前的词
                     Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
@@ -174,7 +174,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
             //*********************************
             //对当前指针位置的字符进行单字匹配
-            Hit singleCharHit = Dictionary.matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
+            Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
             if(singleCharHit.isMatch()){//首字成量词词
                 //输出当前的词
                 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);

View File: org/wltea/analyzer/core/IKArbitrator.java

@@ -38,7 +38,7 @@ class IKArbitrator {
     /**
      * 分词歧义处理
-     * @param orgLexemes
+//     * @param orgLexemes
      * @param useSmart
      */
     void process(AnalyzeContext context , boolean useSmart){
@@ -87,7 +87,6 @@ class IKArbitrator {
      * 歧义识别
      * @param lexemeCell 歧义路径链表头
      * @param fullTextLength 歧义路径文本长度
-     * @param option 候选结果路径
      * @return
      */
     private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
@@ -120,7 +119,7 @@ class IKArbitrator {
     /**
      * 向前遍历添加词元构造一个无歧义词元组合
-     * @param LexemePath path
+//     * @param LexemePath path
      * @return
      */
     private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){
@@ -140,7 +139,7 @@ class IKArbitrator {
     /**
      * 回滚词元链直到它能够接受指定的词元
-     * @param lexeme
+//     * @param lexeme
      * @param l
      */
     private void backPath(Lexeme l , LexemePath option){

View File: org/wltea/analyzer/core/IKSegmenter.java

@@ -23,14 +23,15 @@
  */
 package org.wltea.analyzer.core;

-import org.elasticsearch.common.logging.ESLogger;
-import org.elasticsearch.common.logging.Loggers;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.ArrayList;
 import java.util.List;
+
+import org.wltea.analyzer.cfg.Configuration;
+//import org.wltea.analyzer.cfg.DefaultConfig;
+import org.wltea.analyzer.dic.Dictionary;

 /**
  * IK分词器主类
  *
@@ -39,16 +40,18 @@ public final class IKSegmenter {
     //字符窜reader
     private Reader input;
+    //分词器配置项
+    private Configuration cfg;
     //分词器上下文
     private AnalyzeContext context;
     //分词处理器列表
     private List<ISegmenter> segmenters;
     //分词歧义裁决器
     private IKArbitrator arbitrator;
-    private ESLogger logger=null;
-    private final boolean useSmart;
+    private boolean useSmart = false;

     /**
      * IK分词器构造函数
      * @param input
      * @param useSmart 为true使用智能分词策略
@@ -57,16 +60,31 @@ public final class IKSegmenter {
      * 智能分词 合并数词和量词对分词结果进行歧义判断
      */
     public IKSegmenter(Reader input , boolean useSmart){
-        logger = Loggers.getLogger("ik-analyzer");
         this.input = input;
+//        this.cfg = DefaultConfig.getInstance();
         this.useSmart=useSmart;
         this.init();
+    }
+
+    /**
+     * IK分词器构造函数
+     * @param input
+     * @param cfg 使用自定义的Configuration构造分词器
+     *
+     */
+    public IKSegmenter(Reader input , Configuration cfg){
+        this.input = input;
+        this.cfg = cfg;
+        this.init();
     }

     /**
      * 初始化
      */
     private void init(){
+        //初始化词典单例
+//        Dictionary.initial(this.cfg);
+//        Dictionary.getSingleton();
         //初始化分词上下文
         this.context = new AnalyzeContext(useSmart);
         //加载子分词器
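
Note: although the logger is gone and a Configuration-based constructor is added, the streaming API itself is unchanged: next() returns one Lexeme at a time until it yields null. A minimal consumption sketch, assuming the dictionaries have already been loaded via the plugin's Init path:

    import java.io.IOException;
    import java.io.StringReader;
    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;

    public class SegmentLoopSketch {
        public static void main(String[] args) throws IOException {
            // true = smart mode: merge numerals/quantifiers and arbitrate ambiguity
            IKSegmenter seg = new IKSegmenter(new StringReader("这是一个中文分词的例子"), true);
            Lexeme lexeme;
            while ((lexeme = seg.next()) != null) {
                System.out.println(lexeme.getBeginPosition() + "-" + lexeme.getEndPosition()
                        + " : " + lexeme.getLexemeText());
            }
        }
    }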

View File: org/wltea/analyzer/core/LetterSegmenter.java

@@ -120,7 +120,7 @@ class LetterSegmenter implements ISegmenter {
     /**
      * 处理数字字母混合输出
      * windos2000 | linliangyi2005@gmail.com
-     * @param input
+//     * @param input
      * @param context
      * @return
      */

View File: org/wltea/analyzer/dic/DictSegment.java

@@ -326,13 +326,5 @@ class DictSegment implements Comparable<DictSegment>{
         //对当前节点存储的char进行比较
         return this.nodeChar.compareTo(o.nodeChar);
     }
-
-    public int getDicNum(){
-        if(charMap!=null)
-        {
-            return charMap.size();
-        }
-        return 0;
-    }
 }

View File: org/wltea/analyzer/dic/Dictionary.java

@@ -1,74 +1,233 @@
 /**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
 *
 */
 package org.wltea.analyzer.dic;

+import java.io.*;
+import java.util.Collection;
+import java.util.List;
+
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.wltea.analyzer.cfg.Configuration;
-import java.io.*;
-import java.util.Collection;
-import java.util.List;

+/**
+ * 词典管理类,单子模式
+ */
 public class Dictionary {
-    public static final String PATH_DIC_MAIN = "ik/main.dic";
-    public static final String PATH_DIC_SURNAME = "ik/surname.dic";
-    public static final String PATH_DIC_QUANTIFIER = "ik/quantifier.dic";
-    public static final String PATH_DIC_SUFFIX = "ik/suffix.dic";
-    public static final String PATH_DIC_PREP = "ik/preposition.dic";
-    public static final String PATH_DIC_STOP = "ik/stopword.dic";
-
-    private static final Dictionary singleton;
-    static{
-        singleton = new Dictionary();
-    }
+    /*
+     * 词典单子实例
+     */
+    private static Dictionary singleton;

+    /*
+     * 主词典对象
+     */
     private DictSegment _MainDict;
-    private DictSegment _SurnameDict;
+    /*
+     * 停止词词典
+     */
+    private DictSegment _StopWordDict;
+    /*
+     * 量词词典
+     */
     private DictSegment _QuantifierDict;
-    private DictSegment _SuffixDict;
-    private DictSegment _PrepDict;
-    private DictSegment _StopWords;
-    private Environment environment;
-    private Configuration configuration;
+    /**
+     * 配置对象
+     */
+    private Configuration configuration;
     private ESLogger logger=null;
     private static boolean dictInited=false;
+    private Environment environment;
+
+    public static final String PATH_DIC_MAIN = "ik/main.dic";
+    public static final String PATH_DIC_SURNAME = "ik/surname.dic";
+    public static final String PATH_DIC_QUANTIFIER = "ik/quantifier.dic";
+    public static final String PATH_DIC_SUFFIX = "ik/suffix.dic";
+    public static final String PATH_DIC_PREP = "ik/preposition.dic";
+    public static final String PATH_DIC_STOP = "ik/stopword.dic";

     private Dictionary(){
         logger = Loggers.getLogger("ik-analyzer");
     }

-    public Configuration getConfig(){
-        return configuration;
-    }
+    static{
+        singleton = new Dictionary();
+    }
+
+//    public Configuration getConfig(){
+//        return configuration;
+//    }
+
+//    private Dictionary(Configuration cfg){
+//        this.cfg = cfg;
+//        this.loadMainDict();
+//        this.loadStopWordDict();
+//        this.loadQuantifierDict();
+//    }

     public void Init(Settings indexSettings){
         if(!dictInited){
             environment =new Environment(indexSettings);
             configuration=new Configuration(indexSettings);
             loadMainDict();
-            loadSurnameDict();
+//            loadSurnameDict();
             loadQuantifierDict();
-            loadSuffixDict();
-            loadPrepDict();
+//            loadSuffixDict();
+//            loadPrepDict();
             loadStopWordDict();
             dictInited=true;
         }
     }

+    /**
+     * 词典初始化
+     * 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
+     * 只有当Dictionary类被实际调用时才会开始载入词典
+     * 这将延长首次分词操作的时间
+     * 该方法提供了一个在应用加载阶段就初始化字典的手段
+     * @return Dictionary
+     */
+//    public static Dictionary initial(Configuration cfg){
+//        if(singleton == null){
+//            synchronized(Dictionary.class){
+//                if(singleton == null){
+//                    singleton = new Dictionary();
+//                    return singleton;
+//                }
+//            }
+//        }
+//        return singleton;
+//    }
+
+    /**
+     * 获取词典单子实例
+     * @return Dictionary 单例对象
+     */
+    public static Dictionary getSingleton(){
+        if(singleton == null){
+            throw new IllegalStateException("词典尚未初始化请先调用initial方法");
+        }
+        return singleton;
+    }
+
+    /**
+     * 批量加载新词条
+     * @param words Collection<String>词条列表
+     */
+    public void addWords(Collection<String> words){
+        if(words != null){
+            for(String word : words){
+                if (word != null) {
+                    //批量加载词条到主内存词典中
+                    singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
+                }
+            }
+        }
+    }
+
+    /**
+     * 批量移除屏蔽词条
+     * @param words
+     */
+    public void disableWords(Collection<String> words){
+        if(words != null){
+            for(String word : words){
+                if (word != null) {
+                    //批量屏蔽词条
+                    singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
+                }
+            }
+        }
+    }
+
+    /**
+     * 检索匹配主词典
+     * @param charArray
+     * @return Hit 匹配结果描述
+     */
+    public Hit matchInMainDict(char[] charArray){
+        return singleton._MainDict.match(charArray);
+    }
+
+    /**
+     * 检索匹配主词典
+     * @param charArray
+     * @param begin
+     * @param length
+     * @return Hit 匹配结果描述
+     */
+    public Hit matchInMainDict(char[] charArray , int begin, int length){
+        return singleton._MainDict.match(charArray, begin, length);
+    }
+
+    /**
+     * 检索匹配量词词典
+     * @param charArray
+     * @param begin
+     * @param length
+     * @return Hit 匹配结果描述
+     */
+    public Hit matchInQuantifierDict(char[] charArray , int begin, int length){
+        return singleton._QuantifierDict.match(charArray, begin, length);
+    }
+
+    /**
+     * 从已匹配的Hit中直接取出DictSegment继续向下匹配
+     * @param charArray
+     * @param currentIndex
+     * @param matchedHit
+     * @return Hit
+     */
+    public Hit matchWithHit(char[] charArray , int currentIndex , Hit matchedHit){
+        DictSegment ds = matchedHit.getMatchedDictSegment();
+        return ds.match(charArray, currentIndex, 1 , matchedHit);
+    }
+
+    /**
+     * 判断是否是停止词
+     * @param charArray
+     * @param begin
+     * @param length
+     * @return boolean
+     */
+    public boolean isStopWord(char[] charArray , int begin, int length){
+        return singleton._StopWordDict.match(charArray, begin, length).isMatch();
+    }
+
+    /**
+     * 加载主词典及扩展词典
+     */
     private void loadMainDict(){
+        //建立一个主词典实例
         _MainDict = new DictSegment((char)0);
+        //读取主词典文件
         File file= new File(environment.configFile(), Dictionary.PATH_DIC_MAIN);
         InputStream is = null;
@@ -77,24 +236,21 @@ public class Dictionary {
         } catch (FileNotFoundException e) {
             e.printStackTrace();
         }
-        if(is == null){
-            throw new RuntimeException("Main Dictionary not found!!!");
-        }
         try {
             BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
-            String theWord;
+            String theWord = null;
             do {
                 theWord = br.readLine();
                 if (theWord != null && !"".equals(theWord.trim())) {
-                    _MainDict.fillSegment(theWord.trim().toCharArray());
+                    _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
                 }
             } while (theWord != null);
-            logger.info("[Dict Loading] {},MainDict Size:{}",file.toString(),_MainDict.getDicNum());
         } catch (IOException ioe) {
             System.err.println("Main Dictionary loading exception.");
             ioe.printStackTrace();
         }finally{
             try {
                 if(is != null){
@@ -105,41 +261,42 @@ public class Dictionary {
                 e.printStackTrace();
             }
         }
+        //加载扩展词典
+        this.loadExtDict();
+    }
+
+    /**
+     * 加载用户配置的扩展词典到主词库表
+     */
+    private void loadExtDict(){
+        //加载扩展词典配置
         List<String> extDictFiles = configuration.getExtDictionarys();
         if(extDictFiles != null){
+            InputStream is = null;
             for(String extDictName : extDictFiles){
-                File tempFile=new File(environment.configFile(),extDictName);
-                try {
-                    is = new FileInputStream(tempFile);
-                } catch (FileNotFoundException e) {
-                    e.printStackTrace();
-                    logger.error("[Dict Loading]",e);
-                }
+                //读取扩展词典文件
+                System.out.println("加载扩展词典:" + extDictName);
+                is = this.getClass().getClassLoader().getResourceAsStream(extDictName);
+                //如果找不到扩展的字典则忽略
                 if(is == null){
                     continue;
                 }
                 try {
                     BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
-                    String theWord;
+                    String theWord = null;
                     do {
                         theWord = br.readLine();
                         if (theWord != null && !"".equals(theWord.trim())) {
+                            //加载扩展词典数据到主内存词典中
+                            //System.out.println(theWord);
                             _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
                         }
                     } while (theWord != null);
-                    logger.info("[Dict Loading] {},MainDict Size:{}",tempFile.toString(),_MainDict.getDicNum());
                 } catch (IOException ioe) {
                     System.err.println("Extension Dictionary loading exception.");
                     ioe.printStackTrace();
                 }finally{
                     try {
                         if(is != null){
@@ -151,77 +308,85 @@ public class Dictionary {
             }
         }
     }

-    private void loadSurnameDict(){
-        _SurnameDict = new DictSegment((char)0);
-        File file=new File(environment.configFile(),Dictionary.PATH_DIC_SURNAME);
-        InputStream is = null;
-        try {
-            is = new FileInputStream(file);
-        } catch (FileNotFoundException e) {
-            e.printStackTrace();
-        }
-        if(is == null){
-            throw new RuntimeException("Surname Dictionary not found!!!");
-        }
-        try {
-            BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
-            String theWord;
-            do {
-                theWord = br.readLine();
-                if (theWord != null && !"".equals(theWord.trim())) {
-                    _SurnameDict.fillSegment(theWord.trim().toCharArray());
-                }
-            } while (theWord != null);
-            logger.info("[Dict Loading] {},SurnameDict Size:{}",file.toString(),_SurnameDict.getDicNum());
-        } catch (IOException ioe) {
-            System.err.println("Surname Dictionary loading exception.");
-            ioe.printStackTrace();
-        }finally{
-            try {
-                if(is != null){
-                    is.close();
-                    is = null;
-                }
-            } catch (IOException e) {
-                e.printStackTrace();
-            }
-        }
-    }
+    /**
+     * 加载用户扩展的停止词词典
+     */
+    private void loadStopWordDict(){
+        //建立一个主词典实例
+        _StopWordDict = new DictSegment((char)0);
+        //加载扩展停止词典
+        List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys();
+        if(extStopWordDictFiles != null){
+            InputStream is = null;
+            for(String extStopWordDictName : extStopWordDictFiles){
+                System.out.println("加载扩展停止词典:" + extStopWordDictName);
+                //读取扩展词典文件
+                is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName);
+                //如果找不到扩展的字典则忽略
+                if(is == null){
+                    continue;
+                }
+                try {
+                    BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
+                    String theWord = null;
+                    do {
+                        theWord = br.readLine();
+                        if (theWord != null && !"".equals(theWord.trim())) {
+                            //System.out.println(theWord);
+                            //加载扩展停止词典数据到内存中
+                            _StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
+                        }
+                    } while (theWord != null);
+                } catch (IOException ioe) {
+                    System.err.println("Extension Stop word Dictionary loading exception.");
+                    ioe.printStackTrace();
+                }finally{
+                    try {
+                        if(is != null){
+                            is.close();
+                            is = null;
+                        }
+                    } catch (IOException e) {
+                        e.printStackTrace();
+                    }
+                }
+            }
+        }
+    }

+    /**
+     * 加载量词词典
+     */
     private void loadQuantifierDict(){
+        //建立一个量词典实例
         _QuantifierDict = new DictSegment((char)0);
+        //读取量词词典文件
         File file=new File(environment.configFile(),Dictionary.PATH_DIC_QUANTIFIER);
         InputStream is = null;
         try {
             is = new FileInputStream(file);
         } catch (FileNotFoundException e) {
             e.printStackTrace();
         }
-        if(is == null){
-            throw new RuntimeException("Quantifier Dictionary not found!!!");
-        }
         try {
             BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
-            String theWord;
+            String theWord = null;
             do {
                 theWord = br.readLine();
                 if (theWord != null && !"".equals(theWord.trim())) {
-                    _QuantifierDict.fillSegment(theWord.trim().toCharArray());
+                    _QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
                 }
             } while (theWord != null);
-            logger.info("[Dict Loading] {},QuantifierDict Size:{}",file.toString(),_QuantifierDict.getDicNum());
         } catch (IOException ioe) {
             System.err.println("Quantifier Dictionary loading exception.");
             ioe.printStackTrace();
         }finally{
             try {
                 if(is != null){
@@ -235,304 +400,8 @@ public class Dictionary {
         }
     }

+    public static Dictionary getInstance(){
+        return Dictionary.singleton;
+    }
-    private void loadSuffixDict(){
-        _SuffixDict = new DictSegment((char)0);
-        File file=new File(environment.configFile(),Dictionary.PATH_DIC_SUFFIX);
-        InputStream is = null;
-        try {
-            is = new FileInputStream(file);
-        } catch (FileNotFoundException e) {
-            e.printStackTrace();
-        }
-        if(is == null){
-            throw new RuntimeException("Suffix Dictionary not found!!!");
-        }
-        try {
-            BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
-            String theWord;
-            do {
-                theWord = br.readLine();
-                if (theWord != null && !"".equals(theWord.trim())) {
-                    _SuffixDict.fillSegment(theWord.trim().toCharArray());
-                }
-            } while (theWord != null);
-            logger.info("[Dict Loading] {},SuffixDict Size:{}",file.toString(),_SuffixDict.getDicNum());
-        } catch (IOException ioe) {
-            System.err.println("Suffix Dictionary loading exception.");
-            ioe.printStackTrace();
-        }finally{
-            try {
-                if(is != null){
-                    is.close();
-                    is = null;
-                }
-            } catch (IOException e) {
-                e.printStackTrace();
-            }
-        }
-    }
-
-    private void loadPrepDict(){
-        _PrepDict = new DictSegment((char)0);
-        File file=new File(environment.configFile(),Dictionary.PATH_DIC_PREP);
-        InputStream is = null;
-        try {
-            is = new FileInputStream(file);
-        } catch (FileNotFoundException e) {
-            e.printStackTrace();
-        }
-        if(is == null){
-            throw new RuntimeException("Preposition Dictionary not found!!!");
-        }
-        try {
-            BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
-            String theWord;
-            do {
-                theWord = br.readLine();
-                if (theWord != null && !"".equals(theWord.trim())) {
-                    _PrepDict.fillSegment(theWord.trim().toCharArray());
-                }
-            } while (theWord != null);
-            logger.info("[Dict Loading] {},PrepDict Size:{}",file.toString(),_PrepDict.getDicNum());
-        } catch (IOException ioe) {
-            System.err.println("Preposition Dictionary loading exception.");
-            ioe.printStackTrace();
-        }finally{
-            try {
-                if(is != null){
-                    is.close();
-                    is = null;
-                }
-            } catch (IOException e) {
-                e.printStackTrace();
-            }
-        }
-    }
-
-    private void loadStopWordDict(){
-        _StopWords = new DictSegment((char)0);
-        File file=new File(environment.configFile(),Dictionary.PATH_DIC_STOP);
-        InputStream is = null;
-        try {
-            is = new FileInputStream(file);
-        } catch (FileNotFoundException e) {
-            e.printStackTrace();
-        }
-        if(is == null){
-            throw new RuntimeException("Stopword Dictionary not found!!!");
-        }
-        try {
-            BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
-            String theWord;
-            do {
-                theWord = br.readLine();
-                if (theWord != null && !"".equals(theWord.trim())) {
-                    _StopWords.fillSegment(theWord.trim().toCharArray());
-                }
-            } while (theWord != null);
-            logger.info("[Dict Loading] {},Stopwords Size:{}",file.toString(),_StopWords.getDicNum());
-        } catch (IOException ioe) {
-            System.err.println("Stopword Dictionary loading exception.");
-            ioe.printStackTrace();
-        }finally{
-            try {
-                if(is != null){
-                    is.close();
-                    is = null;
-                }
-            } catch (IOException e) {
-                e.printStackTrace();
-            }
-        }
-        List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys();
-        if(extStopWordDictFiles != null){
-            for(String extStopWordDictName : extStopWordDictFiles){
-                File tempFile=new File(environment.configFile(),extStopWordDictName);
-                try {
-                    is = new FileInputStream(tempFile);
-                } catch (FileNotFoundException e) {
-                    e.printStackTrace();
-                }
-                if(is == null){
-                    continue;
-                }
-                try {
-                    BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
-                    String theWord;
-                    do {
-                        theWord = br.readLine();
-                        if (theWord != null && !"".equals(theWord.trim())) {
-                            _StopWords.fillSegment(theWord.trim().toCharArray());
-                        }
-                    } while (theWord != null);
-                    logger.info("[Dict Loading] {},Stopwords Size:{}",tempFile.toString(),_StopWords.getDicNum());
-                } catch (IOException ioe) {
-                    System.err.println("Extension Stop word Dictionary loading exception.");
-                    ioe.printStackTrace();
-                }finally{
-                    try {
-                        if(is != null){
-                            is.close();
-                            is = null;
-                        }
-                    } catch (IOException e) {
-                        e.printStackTrace();
-                    }
-                }
-            }
-        }
-    }
-
-    public static Dictionary getInstance(){
-        return Dictionary.singleton;
-    }
-
-    public static void loadExtendWords(Collection<String> extWords){
-        if(extWords != null){
-            for(String extWord : extWords){
-                if (extWord != null) {
-                    singleton._MainDict.fillSegment(extWord.trim().toCharArray());
-                }
-            }
-        }
-    }
-
-    public static void loadExtendStopWords(Collection<String> extStopWords){
-        if(extStopWords != null){
-            for(String extStopWord : extStopWords){
-                if (extStopWord != null) {
-                    singleton._StopWords.fillSegment(extStopWord.trim().toCharArray());
-                }
-            }
-        }
-    }
-
-    public static Hit matchInMainDict(char[] charArray){
-        return singleton._MainDict.match(charArray);
-    }
-
-    public static Hit matchInMainDict(char[] charArray , int begin, int length){
-        return singleton._MainDict.match(charArray, begin, length);
-    }
-
-    public static Hit matchInMainDictWithHit(char[] charArray , int currentIndex , Hit matchedHit){
-        DictSegment ds = matchedHit.getMatchedDictSegment();
-        return ds.match(charArray, currentIndex, 1 , matchedHit);
-    }
-
-    public static Hit matchInSurnameDict(char[] charArray , int begin, int length){
-        return singleton._SurnameDict.match(charArray, begin, length);
-    }
-
-    /**
-     * 检索匹配量词词典
-     * @param charArray
-     * @param begin
-     * @param length
-     * @return Hit 匹配结果描述
-     */
-    public static Hit matchInQuantifierDict(char[] charArray , int begin, int length){
-        return singleton._QuantifierDict.match(charArray, begin, length);
-    }
-
-    /**
-     * 检索匹配在后缀词典
-     * @param charArray
-     * @param begin
-     * @param length
-     * @return Hit 匹配结果描述
-     */
-    public static Hit matchInSuffixDict(char[] charArray , int begin, int length){
-        return singleton._SuffixDict.match(charArray, begin, length);
-    }
-
-    /**
-     * 检索匹配介词副词词典
-     * @param charArray
-     * @param begin
-     * @param length
-     * @return Hit 匹配结果描述
-     */
-    public static Hit matchInPrepDict(char[] charArray , int begin, int length){
-        return singleton._PrepDict.match(charArray, begin, length);
-    }
-
-    /**
-     * 判断是否是停止词
-     * @param charArray
-     * @param begin
-     * @param length
-     * @return boolean
-     */
-    public static boolean isStopWord(char[] charArray , int begin, int length){
-        return singleton._StopWords.match(charArray, begin, length).isMatch();
-    }
 }
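
Note: the rewritten class keeps this fork's eager static singleton plus the Init(Settings)/dictInited guard while exposing the upstream 5.x instance API (getSingleton, matchWithHit, addWords, disableWords, ...). One side effect of dropping the old `if (is == null) throw` guards is that a missing ik/main.dic now surfaces later, as a NullPointerException in the reader, instead of the explicit RuntimeException the old code threw. A lifecycle sketch, assuming the ik dictionary files are reachable under the node's config path (the empty Settings object is only a stand-in for the real index settings):

    import org.elasticsearch.common.settings.ImmutableSettings;
    import org.elasticsearch.common.settings.Settings;
    import org.wltea.analyzer.dic.Dictionary;

    public class DictionaryLifecycleSketch {
        public static void main(String[] args) {
            // Init() runs once; later calls are no-ops thanks to dictInited.
            Settings settings = ImmutableSettings.settingsBuilder().build();  // stand-in settings
            Dictionary.getInstance().Init(settings);

            // After Init(), lookups go through the singleton's instance methods.
            char[] text = "的".toCharArray();
            System.out.println("isStopWord(的) = "
                    + Dictionary.getSingleton().isStopWord(text, 0, text.length));
        }
    }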

View File: org/wltea/analyzer/dic/Hit.java

@@ -58,7 +58,9 @@ public class Hit {
     public boolean isMatch() {
         return (this.hitState & MATCH) > 0;
     }
+    /**
+     *
+     */
     public void setMatch() {
         this.hitState = this.hitState | MATCH;
     }
@@ -69,7 +71,9 @@ public class Hit {
     public boolean isPrefix() {
         return (this.hitState & PREFIX) > 0;
     }
+    /**
+     *
+     */
     public void setPrefix() {
         this.hitState = this.hitState | PREFIX;
     }
@@ -79,7 +83,9 @@ public class Hit {
     public boolean isUnmatch() {
         return this.hitState == UNMATCH ;
     }
+    /**
+     *
+     */
     public void setUnmatch() {
         this.hitState = UNMATCH;
     }

View File: org/wltea/analyzer/lucene/IKAnalyzer.java

@@ -1,51 +1,87 @@
 /**
+ * IK 中文分词 版本 5.0.1
+ * IK Analyzer release 5.0.1
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
 *
 */
 package org.wltea.analyzer.lucene;

+import java.io.Reader;
+
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.wltea.analyzer.dic.Dictionary;
-import java.io.Reader;

-public final class IKAnalyzer extends Analyzer {
+/**
+ * IK分词器Lucene Analyzer接口实现
+ * 兼容Lucene 4.0版本
+ */
+public final class IKAnalyzer extends Analyzer{

-    private boolean isMaxWordLength = false;
-    private boolean useSmart=false;
+    private boolean useSmart;

+    public boolean useSmart() {
+        return useSmart;
+    }
+
+    public void setUseSmart(boolean useSmart) {
+        this.useSmart = useSmart;
+    }
+
+    /**
+     * IK分词器Lucene Analyzer接口实现类
+     *
+     * 默认细粒度切分算法
+     */
     public IKAnalyzer(){
         this(false);
     }

-    public IKAnalyzer(boolean isMaxWordLength){
+    /**
+     * IK分词器Lucene Analyzer接口实现类
+     *
+     * @param useSmart 当为true时分词器进行智能切分
+     */
+    public IKAnalyzer(boolean useSmart){
         super();
-        this.setMaxWordLength(isMaxWordLength);
+        this.useSmart = useSmart;
     }

     public IKAnalyzer(Settings indexSetting,Settings settings1) {
         super();
         Dictionary.getInstance().Init(indexSetting);
         if(settings1.get("use_smart", "true").equals("true")){
-            useSmart=true;
+            useSmart = true;
         }
     }

-    @Override
-    public TokenStream tokenStream(String fieldName, Reader reader) {
-        return new IKTokenizer(reader , useSmart);
-    }
-
-    public void setMaxWordLength(boolean isMaxWordLength) {
-        this.isMaxWordLength = isMaxWordLength;
-    }
-
-    public boolean isMaxWordLength() {
-        return isMaxWordLength;
-    }
+    /**
+     * 重载Analyzer接口构造分词组件
+     */
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, final Reader in) {
+        Tokenizer _IKTokenizer = new IKTokenizer(in , this.useSmart());
+        return new TokenStreamComponents(_IKTokenizer);
+    }
 }
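
Note: the old isMaxWordLength flag is replaced by useSmart throughout, and the ES-facing constructor reads it from the analyzer settings. A sketch of how the two-Settings constructor resolves use_smart (the empty builders stand in for the settings Elasticsearch passes the analyzer factory, and assume the ik config directory is reachable):

    import org.apache.lucene.analysis.Analyzer;
    import org.elasticsearch.common.settings.ImmutableSettings;
    import org.elasticsearch.common.settings.Settings;
    import org.wltea.analyzer.lucene.IKAnalyzer;

    public class UseSmartWiringSketch {
        public static void main(String[] args) {
            Settings indexSettings = ImmutableSettings.settingsBuilder().build();
            Settings analyzerSettings = ImmutableSettings.settingsBuilder()
                    .put("use_smart", "false")   // anything but "true"; the default is "true"
                    .build();
            // Dictionary.getInstance().Init(indexSettings) runs inside this constructor.
            Analyzer ik = new IKAnalyzer(indexSettings, analyzerSettings);
            System.out.println(ik);
        }
    }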

View File: org/wltea/analyzer/lucene/IKTokenizer.java

@@ -1,7 +1,7 @@
 /**
  * IK 中文分词 版本 5.0.1
  * IK Analyzer release 5.0.1
  *
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -20,94 +20,95 @@
  * 源代码由林良益(linliangyi2005@gmail.com)提供
  * 版权声明 2012乌龙茶工作室
  * provided by Linliangyi and copyright 2012 by Oolong studio
  *
  *
  */
 package org.wltea.analyzer.lucene;

-import java.io.IOException;
-import java.io.Reader;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.wltea.analyzer.core.IKSegmenter;
 import org.wltea.analyzer.core.Lexeme;
+
+import java.io.IOException;
+import java.io.Reader;

 /**
  * IK分词器 Lucene Tokenizer适配器类
  * 兼容Lucene 4.0版本
  */
 public final class IKTokenizer extends Tokenizer {

     //IK分词器实现
     private IKSegmenter _IKImplement;

     //词元文本属性
     private final CharTermAttribute termAtt;
     //词元位移属性
     private final OffsetAttribute offsetAtt;
     //词元分类属性该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量
     private final TypeAttribute typeAtt;
     //记录最后一个词元的结束位置
     private int endPosition;

     /**
      * Lucene 4.0 Tokenizer适配器类构造函数
      * @param in
      * @param useSmart
      */
     public IKTokenizer(Reader in , boolean useSmart){
         super(in);
         offsetAtt = addAttribute(OffsetAttribute.class);
         termAtt = addAttribute(CharTermAttribute.class);
         typeAtt = addAttribute(TypeAttribute.class);
         _IKImplement = new IKSegmenter(input , useSmart);
     }

     /* (non-Javadoc)
      * @see org.apache.lucene.analysis.TokenStream#incrementToken()
      */
     @Override
     public boolean incrementToken() throws IOException {
         //清除所有的词元属性
         clearAttributes();
         Lexeme nextLexeme = _IKImplement.next();
         if(nextLexeme != null){
             //将Lexeme转成Attributes
             //设置词元文本
             termAtt.append(nextLexeme.getLexemeText());
             //设置词元长度
             termAtt.setLength(nextLexeme.getLength());
             //设置词元位移
             offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
             //记录分词的最后位置
             endPosition = nextLexeme.getEndPosition();
             //记录词元分类
             typeAtt.setType(nextLexeme.getLexemeTypeString());
             //返会true告知还有下个词元
             return true;
         }
         //返会false告知词元输出完毕
         return false;
     }

     /*
      * (non-Javadoc)
      * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
      */
     @Override
     public void reset() throws IOException {
         super.reset();
         _IKImplement.reset(input);
     }

     @Override
     public final void end() {
         // set final offset
         int finalOffset = correctOffset(this.endPosition);
         offsetAtt.setOffset(finalOffset, finalOffset);
     }
 }
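
Note: only the import order changes here; the Lucene 4.0 adapter itself is as before. Its contract matters to callers: reset() must run before the first incrementToken(), and end() publishes the final offset recorded in endPosition. A direct-usage sketch (again assuming an initialized dictionary):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.wltea.analyzer.lucene.IKTokenizer;

    public class TokenizerLifecycleSketch {
        public static void main(String[] args) throws IOException {
            Tokenizer t = new IKTokenizer(new StringReader("中文分词"), true);
            CharTermAttribute term = t.getAttribute(CharTermAttribute.class);
            OffsetAttribute offset = t.getAttribute(OffsetAttribute.class);
            t.reset();                  // contract: before the first incrementToken()
            while (t.incrementToken()) {
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
            }
            t.end();                    // publishes the final offset
            t.close();
        }
    }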

View File: org/wltea/analyzer/query/SWMCQueryBuilder.java

@@ -1,153 +1,153 @@
[The entire file was previously commented out line-by-line with "//"; this commit strips those prefixes. The re-enabled file:]
+/**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ */
+package org.wltea.analyzer.query;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.Version;
+import org.wltea.analyzer.core.IKSegmenter;
+import org.wltea.analyzer.core.Lexeme;
+
+/**
+ * Single Word Multi Char Query Builder
+ * IK分词算法专用
+ * @author linliangyi
+ *
+ */
+public class SWMCQueryBuilder {
+
+    /**
+     * 生成SWMCQuery
+     * @param fieldName
+     * @param keywords
+     * @param quickMode
+     * @return Lucene Query
+     */
+    public static Query create(String fieldName ,String keywords , boolean quickMode){
+        if(fieldName == null || keywords == null){
+            throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
+        }
+        //1.对keywords进行分词处理
+        List<Lexeme> lexemes = doAnalyze(keywords);
+        //2.根据分词结果生成SWMCQuery
+        Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
+        return _SWMCQuery;
+    }
+
+    /**
+     * 分词切分并返回结链表
+     * @param keywords
+     * @return
+     */
+    private static List<Lexeme> doAnalyze(String keywords){
+        List<Lexeme> lexemes = new ArrayList<Lexeme>();
+        IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
+        try{
+            Lexeme l = null;
+            while( (l = ikSeg.next()) != null){
+                lexemes.add(l);
+            }
+        }catch(IOException e){
+            e.printStackTrace();
+        }
+        return lexemes;
+    }
+
+    /**
+     * 根据分词结果生成SWMC搜索
+     * @param fieldName
+//     * @param pathOption
+     * @param quickMode
+     * @return
+     */
+    private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){
+        //构造SWMC的查询表达式
+        StringBuffer keywordBuffer = new StringBuffer();
+        //精简的SWMC的查询表达式
+        StringBuffer keywordBuffer_Short = new StringBuffer();
+        //记录最后词元长度
+        int lastLexemeLength = 0;
+        //记录最后词元结束位置
+        int lastLexemeEnd = -1;
+
+        int shortCount = 0;
+        int totalCount = 0;
+        for(Lexeme l : lexemes){
+            totalCount += l.getLength();
+            //精简表达式
+            if(l.getLength() > 1){
+                keywordBuffer_Short.append(' ').append(l.getLexemeText());
+                shortCount += l.getLength();
+            }
+
+            if(lastLexemeLength == 0){
+                keywordBuffer.append(l.getLexemeText());
+            }else if(lastLexemeLength == 1 && l.getLength() == 1
+                    && lastLexemeEnd == l.getBeginPosition()){//单字位置相邻长度为一合并)
+                keywordBuffer.append(l.getLexemeText());
+            }else{
+                keywordBuffer.append(' ').append(l.getLexemeText());
+
+            }
+            lastLexemeLength = l.getLength();
+            lastLexemeEnd = l.getEndPosition();
+        }
+
+        //借助lucene queryparser 生成SWMC Query
+        QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40));
+        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
+        qp.setAutoGeneratePhraseQueries(true);
+
+        if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
+            try {
+                //System.out.println(keywordBuffer.toString());
+                Query q = qp.parse(keywordBuffer_Short.toString());
+                return q;
+            } catch (ParseException e) {
+                e.printStackTrace();
+            }
+
+        }else{
+            if(keywordBuffer.length() > 0){
+                try {
+                    //System.out.println(keywordBuffer.toString());
+                    Query q = qp.parse(keywordBuffer.toString());
+                    return q;
+                } catch (ParseException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+        return null;
+    }
+}
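
Note: re-enabling this class gives callers a one-line entry point; quickMode lets the builder fall back to the shorter expression when multi-char lexemes cover more than half of the analyzed keywords. Hypothetical usage (the field name and keywords are illustrative):

    import org.apache.lucene.search.Query;
    import org.wltea.analyzer.query.SWMCQueryBuilder;

    public class SwmcUsageSketch {
        public static void main(String[] args) {
            // Requires an initialized IK dictionary, since create() segments the keywords.
            Query q = SWMCQueryBuilder.create("text", "中文分词工具包", true);
            System.out.println("SWMC query: " + q);
        }
    }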

View File: org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java

@@ -1,147 +1,147 @@
[As with SWMCQueryBuilder, the entire file was previously commented out line-by-line with "//" and is re-enabled by stripping those prefixes. The re-enabled file:]
+/**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ *
+ */
+package org.wltea.analyzer.sample;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.Version;
+import org.wltea.analyzer.lucene.IKAnalyzer;
+
+/**
+ * 使用IKAnalyzer进行Lucene索引和查询的演示
+ * 2012-3-2
+ *
+ * 以下是结合Lucene4.0 API的写法
+ *
+ */
+public class LuceneIndexAndSearchDemo {
+
+    /**
+     * 模拟
+     * 创建一个单条记录的索引并对其进行搜索
+     * @param args
+     */
+    public static void main(String[] args){
+        //Lucene Document的域名
+        String fieldName = "text";
+        //检索内容
+        String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
+
+        //实例化IKAnalyzer分词器
+        Analyzer analyzer = new IKAnalyzer(true);
+
+        Directory directory = null;
+        IndexWriter iwriter = null;
+        IndexReader ireader = null;
+        IndexSearcher isearcher = null;
+        try {
+            //建立内存索引对象
+            directory = new RAMDirectory();
+
+            //配置IndexWriterConfig
+            IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer);
+            iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
+            iwriter = new IndexWriter(directory , iwConfig);
+            //写入索引
+            Document doc = new Document();
+            doc.add(new StringField("ID", "10000", Field.Store.YES));
+            doc.add(new TextField(fieldName, text, Field.Store.YES));
+            iwriter.addDocument(doc);
+            iwriter.close();
+
+            //搜索过程**********************************
+            //实例化搜索器
+            ireader = DirectoryReader.open(directory);
+            isearcher = new IndexSearcher(ireader);
+
+            String keyword = "中文分词工具包";
+            //使用QueryParser查询分析器构造Query对象
+            QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
+            qp.setDefaultOperator(QueryParser.AND_OPERATOR);
+            Query query = qp.parse(keyword);
+            System.out.println("Query = " + query);
+
+            //搜索相似度最高的5条记录
+            TopDocs topDocs = isearcher.search(query , 5);
+            System.out.println("命中:" + topDocs.totalHits);
+            //输出结果
+            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
+            for (int i = 0; i < topDocs.totalHits; i++){
+                Document targetDoc = isearcher.doc(scoreDocs[i].doc);
+                System.out.println("内容:" + targetDoc.toString());
+            }
+
+        } catch (CorruptIndexException e) {
+            e.printStackTrace();
+        } catch (LockObtainFailedException e) {
+            e.printStackTrace();
+        } catch (IOException e) {
+            e.printStackTrace();
+        } catch (ParseException e) {
+            e.printStackTrace();
+        } finally{
+            if(ireader != null){
+                try {
+                    ireader.close();
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+            if(directory != null){
+                try {
+                    directory.close();
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+    }
+}