Merge pull request #10 from wyhw/ik_lucene4

elasticsearch ik 0.20.x => 0.90.x
Medcl 2013-05-12 23:14:42 -07:00
commit 43c8bc9f8c
16 changed files with 1520 additions and 1580 deletions

View File

@@ -31,7 +31,7 @@
</parent>
<properties>
<elasticsearch.version>0.20.2</elasticsearch.version>
<elasticsearch.version>0.90.0</elasticsearch.version>
</properties>
<repositories>

View File

@@ -2,19 +2,32 @@ package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.wltea.analyzer.lucene.IKTokenizer;
//import org.wltea.lucene.IKTokenizer;
import java.io.Reader;
public class IkAnalyzer extends Analyzer {
@Override public TokenStream tokenStream(String fieldName, Reader reader) {
return new IKTokenizer(reader,true);
}
// private boolean isMaxWordLength = false;
// @Override public TokenStream tokenStream(String fieldName, Reader reader) {
// return new IKTokenizer(reader,true);
// }
public IkAnalyzer() {
super();
}
@Override
protected TokenStreamComponents createComponents(String s, Reader reader) {
Tokenizer tokenizer = new IKTokenizer(reader, true);
//use the single-argument constructor: the tokenizer serves as both source and sink;
//passing null as the second argument would make tokenStream() return null
return new TokenStreamComponents(tokenizer);
}
// public boolean isMaxWordLength() {
// return isMaxWordLength;
// }
}
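The hunk above captures the core Lucene 3.x to 4.x Analyzer migration: the tokenStream(String, Reader) override goes away and the subclass supplies a TokenStreamComponents via createComponents instead. A minimal consumption sketch against the migrated class, assuming the plugin jars are on the classpath and the IK dictionary has been initialized:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.index.analysis.IkAnalyzer;

public class IkAnalyzerSketch {
    public static void main(String[] args) throws IOException {
        IkAnalyzer analyzer = new IkAnalyzer();
        // In Lucene 4.x, tokenStream() is final and drives createComponents() internally.
        TokenStream ts = analyzer.tokenStream("content", new StringReader("中华人民共和国"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // mandatory before the first incrementToken()
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end();
        ts.close();
    }
}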

View File

@@ -24,11 +24,16 @@
*/
package org.wltea.analyzer.core;
import org.wltea.analyzer.dic.Dictionary;
import java.io.IOException;
import java.io.Reader;
import java.util.*;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;
/**
*
@@ -68,12 +73,12 @@ class AnalyzeContext {
private Map<Integer , LexemePath> pathMap;
//final list of segmentation results
private LinkedList<Lexeme> results;
//segmenter configuration option
private boolean useSmart;
// private Configuration cfg;
public AnalyzeContext(boolean useSmart){
this.useSmart = useSmart;
this.segmentBuff = new char[BUFF_SIZE];
this.charTypes = new int[BUFF_SIZE];
this.buffLocker = new HashSet<String>();
@@ -313,7 +318,7 @@ class AnalyzeContext {
while(result != null){
//merge numerals with quantifiers
this.compound(result);
if(Dictionary.isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
if(Dictionary.getSingleton().isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
//stopword: take the next item from the list
result = this.results.pollFirst();
}else{
@@ -344,6 +349,7 @@
* Combine lexemes
*/
private void compound(Lexeme result){
if(!this.useSmart){
return ;
}
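Note the API shift in the hunk above: the 0.20.x plugin called the static Dictionary.isStopWord(...), while this commit routes every lookup through the Dictionary.getSingleton() instance. A minimal sketch of the new call shape, assuming the dictionary was initialized elsewhere via Dictionary.getInstance().Init(settings):

import org.wltea.analyzer.dic.Dictionary;

final class StopWordCheckSketch {
    // keep a lexeme only if its span is not a stopword
    static boolean keep(char[] segmentBuff, int begin, int length) {
        return !Dictionary.getSingleton().isStopWord(segmentBuff, begin, length);
    }
}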

View File

@@ -25,12 +25,12 @@
*/
package org.wltea.analyzer.core;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
import java.util.LinkedList;
import java.util.List;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
/**
* Sub-segmenter for Chinese and Japanese/Korean text
@@ -58,7 +58,7 @@ class CJKSegmenter implements ISegmenter {
//process the queue of pending word segments
Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
for(Hit hit : tmpArray){
hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit);
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
if(hit.isMatch()){
//emit the current word
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
@@ -77,7 +77,7 @@ class CJKSegmenter implements ISegmenter {
//*********************************
//then try a single-character match at the current cursor position
Hit singleCharHit = Dictionary.matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//the first character is a word by itself
//emit the current word
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
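The Hit returned by matchInMainDict carries the trie node (DictSegment) where matching stopped, so the segmenter extends a partial match one character at a time with matchWithHit instead of rescanning from the root. A sketch of that incremental loop (the scan method is hypothetical; an initialized dictionary is assumed):

import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;

final class IncrementalMatchSketch {
    static void scan(char[] text) {
        // match the first character, then extend while the hit is still a prefix
        Hit hit = Dictionary.getSingleton().matchInMainDict(text, 0, 1);
        if (hit.isMatch()) {
            System.out.println(text[0]); // single-character word
        }
        for (int i = 1; i < text.length && hit.isPrefix(); i++) {
            hit = Dictionary.getSingleton().matchWithHit(text, i, hit);
            if (hit.isMatch()) {
                System.out.println(new String(text, 0, i + 1)); // a complete dictionary word
            }
        }
    }
}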

View File

@@ -24,14 +24,14 @@
*/
package org.wltea.analyzer.core;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
/**
*
* Sub-segmenter for Chinese quantifiers
@@ -155,7 +155,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
//process the queue of pending word segments
Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
for(Hit hit : tmpArray){
hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit);
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
if(hit.isMatch()){
//emit the current word
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
@@ -174,7 +174,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
//*********************************
//try a single-character match at the current cursor position
Hit singleCharHit = Dictionary.matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//the first character is a quantifier by itself
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);

View File

@@ -38,7 +38,7 @@ class IKArbitrator {
/**
* Resolve segmentation ambiguity
* @param orgLexemes
// * @param orgLexemes
* @param useSmart
*/
void process(AnalyzeContext context , boolean useSmart){
@@ -87,7 +87,6 @@ class IKArbitrator {
* Ambiguity detection
* @param lexemeCell head of the ambiguous-path linked list
* @param fullTextLength text length covered by the ambiguous path
* @param option candidate result path
* @return
*/
private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
@@ -120,7 +119,7 @@
/**
* Walk forward, adding lexemes to build an unambiguous lexeme combination
* @param LexemePath path
// * @param LexemePath path
* @return
*/
private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){
@@ -140,7 +139,7 @@
/**
* Roll back the lexeme chain until it can accept the specified lexeme
* @param lexeme
// * @param lexeme
* @param l
*/
private void backPath(Lexeme l , LexemePath option){

View File

@@ -23,14 +23,15 @@
*/
package org.wltea.analyzer.core;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import org.wltea.analyzer.cfg.Configuration;
//import org.wltea.analyzer.cfg.DefaultConfig;
import org.wltea.analyzer.dic.Dictionary;
/**
* Main IK segmenter class
*
@@ -39,16 +40,18 @@ public final class IKSegmenter {
//character stream reader
private Reader input;
//segmenter configuration
private Configuration cfg;
//segmentation context
private AnalyzeContext context;
//list of sub-segmenters
private List<ISegmenter> segmenters;
//segmentation ambiguity arbitrator
private IKArbitrator arbitrator;
private ESLogger logger=null;
private final boolean useSmart;
private boolean useSmart = false;
/**
* IK segmenter constructor
* @param input
* @param useSmart true to use the smart segmentation strategy
@@ -57,16 +60,31 @@ public final class IKSegmenter {
* smart segmentation: merge numerals with quantifiers and arbitrate ambiguous results
*/
public IKSegmenter(Reader input , boolean useSmart){
logger = Loggers.getLogger("ik-analyzer");
this.input = input;
// this.cfg = DefaultConfig.getInstance();
this.useSmart=useSmart;
this.init();
}
/**
* IK segmenter constructor
* @param input
* @param cfg build the segmenter with a custom Configuration
*
*/
public IKSegmenter(Reader input , Configuration cfg){
this.input = input;
this.cfg = cfg;
this.init();
}
/**
* Initialize
*/
private void init(){
//initialize the dictionary singleton
// Dictionary.initial(this.cfg);
// Dictionary.getSingleton();
//initialize the segmentation context
this.context = new AnalyzeContext(useSmart);
//load the sub-segmenters
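With the constructors above, the segmenter can also be driven directly, outside Lucene. A minimal sketch in smart mode, assuming the dictionary singleton has already been initialized by the plugin:

import java.io.IOException;
import java.io.StringReader;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class SegmenterSketch {
    public static void main(String[] args) throws IOException {
        IKSegmenter seg = new IKSegmenter(new StringReader("中文分词工具包"), true);
        // next() returns null once the input is exhausted
        for (Lexeme l = seg.next(); l != null; l = seg.next()) {
            System.out.println(l.getLexemeText() + " [" + l.getBeginPosition() + "," + l.getEndPosition() + ")");
        }
    }
}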

View File

@@ -120,7 +120,7 @@ class LetterSegmenter implements ISegmenter {
/**
* Handle mixed alphanumeric output,
* e.g. windos2000 | linliangyi2005@gmail.com
* @param input
// * @param input
* @param context
* @return
*/

View File

@@ -327,12 +327,4 @@ class DictSegment implements Comparable<DictSegment>{
return this.nodeChar.compareTo(o.nodeChar);
}
public int getDicNum(){
//number of first-level children in the char map (used for dictionary-size logging)
if(charMap!=null){
return charMap.size();
}
return 0;
}
}

View File

@@ -1,74 +1,233 @@
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*
*/
package org.wltea.analyzer.dic;
import java.io.*;
import java.util.Collection;
import java.util.List;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;
import java.io.*;
import java.util.Collection;
import java.util.List;
/**
* Dictionary manager class, singleton pattern
*/
public class Dictionary {
public static final String PATH_DIC_MAIN = "ik/main.dic";
public static final String PATH_DIC_SURNAME = "ik/surname.dic";
public static final String PATH_DIC_QUANTIFIER = "ik/quantifier.dic";
public static final String PATH_DIC_SUFFIX = "ik/suffix.dic";
public static final String PATH_DIC_PREP = "ik/preposition.dic";
public static final String PATH_DIC_STOP = "ik/stopword.dic";
private static final Dictionary singleton;
static{
singleton = new Dictionary();
}
/*
* singleton dictionary instance
*/
private static Dictionary singleton;
/*
* main dictionary object
*/
private DictSegment _MainDict;
private DictSegment _SurnameDict;
/*
* stopword dictionary
*/
private DictSegment _StopWordDict;
/*
* quantifier dictionary
*/
private DictSegment _QuantifierDict;
private DictSegment _SuffixDict;
private DictSegment _PrepDict;
private DictSegment _StopWords;
private Environment environment;
private Configuration configuration;
/**
* configuration object
*/
private Configuration configuration;
private ESLogger logger=null;
private static boolean dictInited=false;
private Dictionary(){
private Environment environment;
public static final String PATH_DIC_MAIN = "ik/main.dic";
public static final String PATH_DIC_SURNAME = "ik/surname.dic";
public static final String PATH_DIC_QUANTIFIER = "ik/quantifier.dic";
public static final String PATH_DIC_SUFFIX = "ik/suffix.dic";
public static final String PATH_DIC_PREP = "ik/preposition.dic";
public static final String PATH_DIC_STOP = "ik/stopword.dic";
private Dictionary(){
logger = Loggers.getLogger("ik-analyzer");
}
public Configuration getConfig(){
return configuration;
}
}
static{
singleton = new Dictionary();
}
// public Configuration getConfig(){
// return configuration;
// }
// private Dictionary(Configuration cfg){
// this.cfg = cfg;
// this.loadMainDict();
// this.loadStopWordDict();
// this.loadQuantifierDict();
// }
public void Init(Settings indexSettings){
if(!dictInited){
environment =new Environment(indexSettings);
configuration=new Configuration(indexSettings);
loadMainDict();
loadSurnameDict();
loadQuantifierDict();
loadSuffixDict();
loadPrepDict();
loadStopWordDict();
dictInited=true;
}
if(!dictInited){
environment =new Environment(indexSettings);
configuration=new Configuration(indexSettings);
loadMainDict();
// loadSurnameDict();
loadQuantifierDict();
// loadSuffixDict();
// loadPrepDict();
loadStopWordDict();
dictInited=true;
}
}
private void loadMainDict(){
_MainDict = new DictSegment((char)0);
/**
* Dictionary initialization.
* Because IK Analyzer initializes its dictionaries via static methods of the Dictionary class,
* loading only begins once the Dictionary class is actually used,
* which lengthens the first segmentation operation.
* This method offers a way to initialize the dictionaries during application startup.
* @return Dictionary
*/
// public static Dictionary initial(Configuration cfg){
// if(singleton == null){
// synchronized(Dictionary.class){
// if(singleton == null){
// singleton = new Dictionary();
// return singleton;
// }
// }
// }
// return singleton;
// }
/**
* Get the dictionary singleton instance
* @return Dictionary singleton
*/
public static Dictionary getSingleton(){
if(singleton == null){
throw new IllegalStateException("词典尚未初始化请先调用initial方法");
}
return singleton;
}
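Taken together with the Init method above, the intended lifecycle in this commit is: the static block constructs the singleton, Init(indexSettings) loads the dictionaries once (guarded by dictInited), and all later lookups go through getSingleton(). A sketch of that order; the empty Settings object is a hypothetical stand-in for real index settings:

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.wltea.analyzer.dic.Dictionary;

final class DictionaryBootSketch {
    static void boot() {
        Settings indexSettings = ImmutableSettings.settingsBuilder().build(); // hypothetical
        Dictionary.getInstance().Init(indexSettings); // loads main/quantifier/stopword dicts once
        // all subsequent lookups use the singleton instance
        Dictionary.getSingleton().matchInMainDict("中国".toCharArray(), 0, 2);
    }
}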
/**
* Load new words in batch
* @param words Collection<String> of entries
*/
public void addWords(Collection<String> words){
if(words != null){
for(String word : words){
if (word != null) {
//batch-load entries into the in-memory main dictionary
singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
}
}
}
}
/**
* Disable (mask) words in batch
* @param words
*/
public void disableWords(Collection<String> words){
if(words != null){
for(String word : words){
if (word != null) {
//batch-disable entries
singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
}
}
}
}
/**
* Match against the main dictionary
* @param charArray
* @return Hit describing the match result
*/
public Hit matchInMainDict(char[] charArray){
return singleton._MainDict.match(charArray);
}
/**
* Match against the main dictionary
* @param charArray
* @param begin
* @param length
* @return Hit describing the match result
*/
public Hit matchInMainDict(char[] charArray , int begin, int length){
return singleton._MainDict.match(charArray, begin, length);
}
/**
* Match against the quantifier dictionary
* @param charArray
* @param begin
* @param length
* @return Hit describing the match result
*/
public Hit matchInQuantifierDict(char[] charArray , int begin, int length){
return singleton._QuantifierDict.match(charArray, begin, length);
}
/**
* Take the DictSegment from an already-matched Hit and continue matching from there
* @param charArray
* @param currentIndex
* @param matchedHit
* @return Hit
*/
public Hit matchWithHit(char[] charArray , int currentIndex , Hit matchedHit){
DictSegment ds = matchedHit.getMatchedDictSegment();
return ds.match(charArray, currentIndex, 1 , matchedHit);
}
/**
* Check whether the given span is a stopword
* @param charArray
* @param begin
* @param length
* @return boolean
*/
public boolean isStopWord(char[] charArray , int begin, int length){
return singleton._StopWordDict.match(charArray, begin, length).isMatch();
}
/**
* Load the main dictionary and extension dictionaries
*/
private void loadMainDict(){
//create the main dictionary instance
_MainDict = new DictSegment((char)0);
//read the main dictionary file
File file= new File(environment.configFile(), Dictionary.PATH_DIC_MAIN);
InputStream is = null;
@@ -77,20 +236,17 @@ public class Dictionary {
} catch (FileNotFoundException e) {
e.printStackTrace();
}
if(is == null){
throw new RuntimeException("Main Dictionary not found!!!");
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
String theWord;
String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_MainDict.fillSegment(theWord.trim().toCharArray());
_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
logger.info("[Dict Loading] {},MainDict Size:{}",file.toString(),_MainDict.getDicNum());
} catch (IOException ioe) {
System.err.println("Main Dictionary loading exception.");
ioe.printStackTrace();
@@ -105,37 +261,38 @@ public class Dictionary {
e.printStackTrace();
}
}
//load extension dictionaries
this.loadExtDict();
}
/**
* Load user-configured extension dictionaries into the main dictionary
*/
private void loadExtDict(){
//load the extension dictionary configuration
List<String> extDictFiles = configuration.getExtDictionarys();
if(extDictFiles != null){
InputStream is = null;
for(String extDictName : extDictFiles){
File tempFile=new File(environment.configFile(),extDictName);
try {
is = new FileInputStream(tempFile);
} catch (FileNotFoundException e) {
e.printStackTrace();
logger.error("[Dict Loading]",e);
}
if(is == null){
//read the extension dictionary file
System.out.println("加载扩展词典:" + extDictName);
is = this.getClass().getClassLoader().getResourceAsStream(extDictName);
//if the extension dictionary cannot be found, skip it
if(is == null){
continue;
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
String theWord;
String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
//load extension dictionary entries into the in-memory main dictionary
//System.out.println(theWord);
_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
logger.info("[Dict Loading] {},MainDict Size:{}",tempFile.toString(),_MainDict.getDicNum());
} catch (IOException ioe) {
System.err.println("Extension Dictionary loading exception.");
ioe.printStackTrace();
@@ -154,70 +311,78 @@
}
}
private void loadSurnameDict(){
_SurnameDict = new DictSegment((char)0);
File file=new File(environment.configFile(),Dictionary.PATH_DIC_SURNAME);
InputStream is = null;
try {
is = new FileInputStream(file);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
if(is == null){
throw new RuntimeException("Surname Dictionary not found!!!");
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_SurnameDict.fillSegment(theWord.trim().toCharArray());
/**
* Load the user's extension stopword dictionaries
*/
private void loadStopWordDict(){
//create the stopword dictionary instance
_StopWordDict = new DictSegment((char)0);
//load extension stopword dictionaries
List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys();
if(extStopWordDictFiles != null){
InputStream is = null;
for(String extStopWordDictName : extStopWordDictFiles){
System.out.println("加载扩展停止词典:" + extStopWordDictName);
//read the extension dictionary file
is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName);
//if the extension dictionary cannot be found, skip it
if(is == null){
continue;
}
} while (theWord != null);
logger.info("[Dict Loading] {},SurnameDict Size:{}",file.toString(),_SurnameDict.getDicNum());
} catch (IOException ioe) {
System.err.println("Surname Dictionary loading exception.");
ioe.printStackTrace();
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
//System.out.println(theWord);
//load extension stopword entries into memory
_StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
}finally{
try {
if(is != null){
is.close();
is = null;
} catch (IOException ioe) {
System.err.println("Extension Stop word Dictionary loading exception.");
ioe.printStackTrace();
}finally{
try {
if(is != null){
is.close();
is = null;
}
} catch (IOException e) {
e.printStackTrace();
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
* Load the quantifier dictionary
*/
private void loadQuantifierDict(){
//create a quantifier dictionary instance
_QuantifierDict = new DictSegment((char)0);
//read the quantifier dictionary file
File file=new File(environment.configFile(),Dictionary.PATH_DIC_QUANTIFIER);
InputStream is = null;
try {
is = new FileInputStream(file);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
if(is == null){
throw new RuntimeException("Quantifier Dictionary not found!!!");
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
String theWord;
String theWord = null;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_QuantifierDict.fillSegment(theWord.trim().toCharArray());
_QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
} while (theWord != null);
logger.info("[Dict Loading] {},QuantifierDict Size:{}",file.toString(),_QuantifierDict.getDicNum());
} catch (IOException ioe) {
System.err.println("Quantifier Dictionary loading exception.");
ioe.printStackTrace();
@@ -235,304 +400,8 @@ public class Dictionary {
}
private void loadSuffixDict(){
public static Dictionary getInstance(){
return Dictionary.singleton;
}
_SuffixDict = new DictSegment((char)0);
File file=new File(environment.configFile(),Dictionary.PATH_DIC_SUFFIX);
InputStream is = null;
try {
is = new FileInputStream(file);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
if(is == null){
throw new RuntimeException("Suffix Dictionary not found!!!");
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_SuffixDict.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);
logger.info("[Dict Loading] {},SuffixDict Size:{}",file.toString(),_SuffixDict.getDicNum());
} catch (IOException ioe) {
System.err.println("Suffix Dictionary loading exception.");
ioe.printStackTrace();
}finally{
try {
if(is != null){
is.close();
is = null;
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
private void loadPrepDict(){
_PrepDict = new DictSegment((char)0);
File file=new File(environment.configFile(),Dictionary.PATH_DIC_PREP);
InputStream is = null;
try {
is = new FileInputStream(file);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
if(is == null){
throw new RuntimeException("Preposition Dictionary not found!!!");
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_PrepDict.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);
logger.info("[Dict Loading] {},PrepDict Size:{}",file.toString(),_PrepDict.getDicNum());
} catch (IOException ioe) {
System.err.println("Preposition Dictionary loading exception.");
ioe.printStackTrace();
}finally{
try {
if(is != null){
is.close();
is = null;
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
private void loadStopWordDict(){
_StopWords = new DictSegment((char)0);
File file=new File(environment.configFile(),Dictionary.PATH_DIC_STOP);
InputStream is = null;
try {
is = new FileInputStream(file);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
if(is == null){
throw new RuntimeException("Stopword Dictionary not found!!!");
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_StopWords.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);
logger.info("[Dict Loading] {},Stopwords Size:{}",file.toString(),_StopWords.getDicNum());
} catch (IOException ioe) {
System.err.println("Stopword Dictionary loading exception.");
ioe.printStackTrace();
}finally{
try {
if(is != null){
is.close();
is = null;
}
} catch (IOException e) {
e.printStackTrace();
}
}
List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys();
if(extStopWordDictFiles != null){
for(String extStopWordDictName : extStopWordDictFiles){
File tempFile=new File(environment.configFile(),extStopWordDictName);
try {
is = new FileInputStream(tempFile);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
if(is == null){
continue;
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
_StopWords.fillSegment(theWord.trim().toCharArray());
}
} while (theWord != null);
logger.info("[Dict Loading] {},Stopwords Size:{}",tempFile.toString(),_StopWords.getDicNum());
} catch (IOException ioe) {
System.err.println("Extension Stop word Dictionary loading exception.");
ioe.printStackTrace();
}finally{
try {
if(is != null){
is.close();
is = null;
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
public static Dictionary getInstance(){
return Dictionary.singleton;
}
public static void loadExtendWords(Collection<String> extWords){
if(extWords != null){
for(String extWord : extWords){
if (extWord != null) {
singleton._MainDict.fillSegment(extWord.trim().toCharArray());
}
}
}
}
public static void loadExtendStopWords(Collection<String> extStopWords){
if(extStopWords != null){
for(String extStopWord : extStopWords){
if (extStopWord != null) {
singleton._StopWords.fillSegment(extStopWord.trim().toCharArray());
}
}
}
}
public static Hit matchInMainDict(char[] charArray){
return singleton._MainDict.match(charArray);
}
public static Hit matchInMainDict(char[] charArray , int begin, int length){
return singleton._MainDict.match(charArray, begin, length);
}
public static Hit matchInMainDictWithHit(char[] charArray , int currentIndex , Hit matchedHit){
DictSegment ds = matchedHit.getMatchedDictSegment();
return ds.match(charArray, currentIndex, 1 , matchedHit);
}
public static Hit matchInSurnameDict(char[] charArray , int begin, int length){
return singleton._SurnameDict.match(charArray, begin, length);
}
/**
* Match against the quantifier dictionary
* @param charArray
* @param begin
* @param length
* @return Hit describing the match result
*/
public static Hit matchInQuantifierDict(char[] charArray , int begin, int length){
return singleton._QuantifierDict.match(charArray, begin, length);
}
/**
* Match against the suffix dictionary
* @param charArray
* @param begin
* @param length
* @return Hit describing the match result
*/
public static Hit matchInSuffixDict(char[] charArray , int begin, int length){
return singleton._SuffixDict.match(charArray, begin, length);
}
/**
* Match against the preposition/adverb dictionary
* @param charArray
* @param begin
* @param length
* @return Hit describing the match result
*/
public static Hit matchInPrepDict(char[] charArray , int begin, int length){
return singleton._PrepDict.match(charArray, begin, length);
}
/**
* Check whether the given span is a stopword
* @param charArray
* @param begin
* @param length
* @return boolean
*/
public static boolean isStopWord(char[] charArray , int begin, int length){
return singleton._StopWords.match(charArray, begin, length).isMatch();
}
}

View File

@@ -58,7 +58,9 @@ public class Hit {
public boolean isMatch() {
return (this.hitState & MATCH) > 0;
}
/**
* Mark this Hit as a full dictionary match
*/
public void setMatch() {
this.hitState = this.hitState | MATCH;
}
@@ -69,7 +71,9 @@
public boolean isPrefix() {
return (this.hitState & PREFIX) > 0;
}
/**
* Mark this Hit as a prefix match
*/
public void setPrefix() {
this.hitState = this.hitState | PREFIX;
}
@@ -79,7 +83,9 @@
public boolean isUnmatch() {
return this.hitState == UNMATCH ;
}
/**
* Mark this Hit as unmatched
*/
public void setUnmatch() {
this.hitState = UNMATCH;
}
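For reference, hitState is a bit field, so MATCH and PREFIX can hold at the same time, e.g. an entry that is both a complete word and the prefix of a longer entry. The constant values below are assumptions inferred from the bit tests above, not taken from this diff:

final class HitStateSketch {
    static final int UNMATCH = 0x00000000; // assumed value
    static final int MATCH   = 0x00000001; // assumed value
    static final int PREFIX  = 0x00000010; // assumed value

    public static void main(String[] args) {
        int state = UNMATCH;
        state |= MATCH;  // setMatch()
        state |= PREFIX; // setPrefix()
        System.out.println((state & MATCH) > 0);  // true: a complete word
        System.out.println((state & PREFIX) > 0); // true: also a prefix of a longer word
    }
}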

View File

@@ -1,51 +1,87 @@
/**
* IK 中文分词 版本 5.0.1
* IK Analyzer release 5.0.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package org.wltea.analyzer.lucene;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.settings.Settings;
import org.wltea.analyzer.dic.Dictionary;
import java.io.Reader;
/**
* IK segmenter implementation of the Lucene Analyzer interface
* Compatible with Lucene 4.0
*/
public final class IKAnalyzer extends Analyzer {
private boolean useSmart;
private boolean isMaxWordLength = false;
private boolean useSmart=false;
public boolean useSmart() {
return useSmart;
}
public IKAnalyzer(){
public void setUseSmart(boolean useSmart) {
this.useSmart = useSmart;
}
/**
* IK segmenter Lucene Analyzer implementation class
*
* Defaults to the fine-grained segmentation algorithm
*/
public IKAnalyzer(){
this(false);
}
public IKAnalyzer(boolean isMaxWordLength){
/**
* IK segmenter Lucene Analyzer implementation class
*
* @param useSmart when true, the segmenter performs smart segmentation
*/
public IKAnalyzer(boolean useSmart){
super();
this.setMaxWordLength(isMaxWordLength);
this.useSmart = useSmart;
}
public IKAnalyzer(Settings indexSetting,Settings settings1) {
super();
Dictionary.getInstance().Init(indexSetting);
if(settings1.get("use_smart", "true").equals("true")){
useSmart = true;
}
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new IKTokenizer(reader , useSmart);
}
public void setMaxWordLength(boolean isMaxWordLength) {
this.isMaxWordLength = isMaxWordLength;
}
public boolean isMaxWordLength() {
return isMaxWordLength;
/**
* Override the Analyzer hook to build the token stream components
*/
@Override
protected TokenStreamComponents createComponents(String fieldName, final Reader in) {
Tokenizer _IKTokenizer = new IKTokenizer(in , this.useSmart());
return new TokenStreamComponents(_IKTokenizer);
}
}
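The two-Settings constructor above is the entry point the Elasticsearch analysis module uses: it initializes the dictionary and reads use_smart (defaulting to "true"). A sketch of how it might be wired up; the settings builder is standard ES 0.90 API, but the wiring itself is illustrative rather than the plugin's actual provider code:

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.wltea.analyzer.lucene.IKAnalyzer;

final class AnalyzerWiringSketch {
    static IKAnalyzer build(Settings indexSettings) {
        Settings analyzerSettings = ImmutableSettings.settingsBuilder()
                .put("use_smart", "false") // fine-grained output; "true" enables smart mode
                .build();
        return new IKAnalyzer(indexSettings, analyzerSettings); // also triggers Dictionary Init
    }
}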

View File

@@ -26,88 +26,89 @@
*/
package org.wltea.analyzer.lucene;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import java.io.IOException;
import java.io.Reader;
/**
* IK segmenter adapter for the Lucene Tokenizer interface
* Compatible with Lucene 4.0
*/
public final class IKTokenizer extends Tokenizer {
//IK segmenter implementation
private IKSegmenter _IKImplement;
//lexeme text attribute
private final CharTermAttribute termAtt;
//lexeme offset attribute
private final OffsetAttribute offsetAtt;
//lexeme type attribute (see the type constants in org.wltea.analyzer.core.Lexeme)
private final TypeAttribute typeAtt;
//records the end position of the last lexeme
private int endPosition;
/**
* Lucene 4.0 Tokenizer adapter constructor
* @param in
* @param useSmart
*/
public IKTokenizer(Reader in , boolean useSmart){
super(in);
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
_IKImplement = new IKSegmenter(input , useSmart);
}
/* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
*/
@Override
public boolean incrementToken() throws IOException {
//clear all lexeme attributes
clearAttributes();
Lexeme nextLexeme = _IKImplement.next();
if(nextLexeme != null){
//convert the Lexeme into Attributes
//set the lexeme text
termAtt.append(nextLexeme.getLexemeText());
//set the lexeme length
termAtt.setLength(nextLexeme.getLength());
//set the lexeme offsets
offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
//record the last position of this segmentation pass
endPosition = nextLexeme.getEndPosition();
//record the lexeme type
typeAtt.setType(nextLexeme.getLexemeTypeString());
//return true to signal that another lexeme is available
return true;
}
//return false to signal that all lexemes have been emitted
return false;
}
/*
* (non-Javadoc)
* @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
*/
@Override
public void reset() throws IOException {
super.reset();
_IKImplement.reset(input);
}
@Override
public final void end() {
// set final offset
int finalOffset = correctOffset(this.endPosition);
offsetAtt.setOffset(finalOffset, finalOffset);
}
}
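A sketch exercising the adapter directly, showing the Lucene 4.x reset/incrementToken/end lifecycle that the overrides above implement (end() publishes the final offset recorded in endPosition; an initialized dictionary is assumed):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.wltea.analyzer.lucene.IKTokenizer;

public class TokenizerSketch {
    public static void main(String[] args) throws IOException {
        IKTokenizer tk = new IKTokenizer(new StringReader("中文分词"), false);
        CharTermAttribute term = tk.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tk.getAttribute(OffsetAttribute.class);
        tk.reset();
        while (tk.incrementToken()) {
            System.out.println(term + " " + offset.startOffset() + "-" + offset.endOffset());
        }
        tk.end();   // sets the final offset from endPosition
        tk.close();
    }
}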

View File

@@ -1,153 +1,153 @@
///**
// * IK 中文分词 版本 5.0
// * IK Analyzer release 5.0
// *
// * Licensed to the Apache Software Foundation (ASF) under one or more
// * contributor license agreements. See the NOTICE file distributed with
// * this work for additional information regarding copyright ownership.
// * The ASF licenses this file to You under the Apache License, Version 2.0
// * (the "License"); you may not use this file except in compliance with
// * the License. You may obtain a copy of the License at
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// * 源代码由林良益(linliangyi2005@gmail.com)提供
// * 版权声明 2012乌龙茶工作室
// * provided by Linliangyi and copyright 2012 by Oolong studio
// *
// */
//package org.wltea.analyzer.query;
//
//import java.io.IOException;
//import java.io.StringReader;
//import java.util.ArrayList;
//import java.util.List;
//
//import org.apache.lucene.analysis.standard.StandardAnalyzer;
//import org.apache.lucene.queryparser.classic.ParseException;
//import org.apache.lucene.queryparser.classic.QueryParser;
//import org.apache.lucene.search.Query;
//import org.apache.lucene.util.Version;
//import org.wltea.analyzer.core.IKSegmenter;
//import org.wltea.analyzer.core.Lexeme;
//
///**
// * Single Word Multi Char Query Builder
// * IK分词算法专用
// * @author linliangyi
// *
// */
//public class SWMCQueryBuilder {
//
// /**
// * 生成SWMCQuery
// * @param fieldName
// * @param keywords
// * @param quickMode
// * @return Lucene Query
// */
// public static Query create(String fieldName ,String keywords , boolean quickMode){
// if(fieldName == null || keywords == null){
// throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
// }
// //1.对keywords进行分词处理
// List<Lexeme> lexemes = doAnalyze(keywords);
// //2.根据分词结果生成SWMCQuery
// Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
// return _SWMCQuery;
// }
//
// /**
// * 分词切分并返回结链表
// * @param keywords
// * @return
// */
// private static List<Lexeme> doAnalyze(String keywords){
// List<Lexeme> lexemes = new ArrayList<Lexeme>();
// IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
// try{
// Lexeme l = null;
// while( (l = ikSeg.next()) != null){
// lexemes.add(l);
// }
// }catch(IOException e){
// e.printStackTrace();
// }
// return lexemes;
// }
//
//
// /**
// * 根据分词结果生成SWMC搜索
// * @param fieldName
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package org.wltea.analyzer.query;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
/**
* Single Word Multi Char Query Builder
* Tailored to the IK segmentation algorithm
* @author linliangyi
*
*/
public class SWMCQueryBuilder {
/**
* Build the SWMCQuery
* @param fieldName
* @param keywords
* @param quickMode
* @return Lucene Query
*/
public static Query create(String fieldName ,String keywords , boolean quickMode){
if(fieldName == null || keywords == null){
throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
}
//1. segment the keywords
List<Lexeme> lexemes = doAnalyze(keywords);
//2. build the SWMCQuery from the segmentation results
Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
return _SWMCQuery;
}
/**
* Segment the text and return the lexeme list
* @param keywords
* @return
*/
private static List<Lexeme> doAnalyze(String keywords){
List<Lexeme> lexemes = new ArrayList<Lexeme>();
IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
try{
Lexeme l = null;
while( (l = ikSeg.next()) != null){
lexemes.add(l);
}
}catch(IOException e){
e.printStackTrace();
}
return lexemes;
}
/**
* Build the SWMC query from the segmentation results
* @param fieldName
// * @param pathOption
// * @param quickMode
// * @return
// */
// private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){
// //构造SWMC的查询表达式
// StringBuffer keywordBuffer = new StringBuffer();
// //精简的SWMC的查询表达式
// StringBuffer keywordBuffer_Short = new StringBuffer();
// //记录最后词元长度
// int lastLexemeLength = 0;
// //记录最后词元结束位置
// int lastLexemeEnd = -1;
//
// int shortCount = 0;
// int totalCount = 0;
// for(Lexeme l : lexemes){
// totalCount += l.getLength();
// //精简表达式
// if(l.getLength() > 1){
// keywordBuffer_Short.append(' ').append(l.getLexemeText());
// shortCount += l.getLength();
// }
//
// if(lastLexemeLength == 0){
// keywordBuffer.append(l.getLexemeText());
// }else if(lastLexemeLength == 1 && l.getLength() == 1
// && lastLexemeEnd == l.getBeginPosition()){//单字位置相邻长度为一合并)
// keywordBuffer.append(l.getLexemeText());
// }else{
// keywordBuffer.append(' ').append(l.getLexemeText());
//
// }
// lastLexemeLength = l.getLength();
// lastLexemeEnd = l.getEndPosition();
// }
//
// //借助lucene queryparser 生成SWMC Query
// QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40));
// qp.setDefaultOperator(QueryParser.AND_OPERATOR);
// qp.setAutoGeneratePhraseQueries(true);
//
// if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
// try {
// //System.out.println(keywordBuffer.toString());
// Query q = qp.parse(keywordBuffer_Short.toString());
// return q;
// } catch (ParseException e) {
// e.printStackTrace();
// }
//
// }else{
// if(keywordBuffer.length() > 0){
// try {
// //System.out.println(keywordBuffer.toString());
// Query q = qp.parse(keywordBuffer.toString());
// return q;
// } catch (ParseException e) {
// e.printStackTrace();
// }
// }
// }
// return null;
// }
//}
* @param quickMode
* @return
*/
private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){
//build the SWMC query expression
StringBuffer keywordBuffer = new StringBuffer();
//condensed SWMC query expression
StringBuffer keywordBuffer_Short = new StringBuffer();
//track the length of the last lexeme
int lastLexemeLength = 0;
//track the end position of the last lexeme
int lastLexemeEnd = -1;
int shortCount = 0;
int totalCount = 0;
for(Lexeme l : lexemes){
totalCount += l.getLength();
//condensed expression
if(l.getLength() > 1){
keywordBuffer_Short.append(' ').append(l.getLexemeText());
shortCount += l.getLength();
}
if(lastLexemeLength == 0){
keywordBuffer.append(l.getLexemeText());
}else if(lastLexemeLength == 1 && l.getLength() == 1
&& lastLexemeEnd == l.getBeginPosition()){//adjacent single characters (length one): merge
keywordBuffer.append(l.getLexemeText());
}else{
keywordBuffer.append(' ').append(l.getLexemeText());
}
lastLexemeLength = l.getLength();
lastLexemeEnd = l.getEndPosition();
}
//use the Lucene QueryParser to build the SWMC Query
QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40));
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
qp.setAutoGeneratePhraseQueries(true);
if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
try {
//System.out.println(keywordBuffer.toString());
Query q = qp.parse(keywordBuffer_Short.toString());
return q;
} catch (ParseException e) {
e.printStackTrace();
}
}else{
if(keywordBuffer.length() > 0){
try {
//System.out.println(keywordBuffer.toString());
Query q = qp.parse(keywordBuffer.toString());
return q;
} catch (ParseException e) {
e.printStackTrace();
}
}
}
return null;
}
}
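Usage sketch for the re-enabled builder (an initialized dictionary is assumed): quickMode trades recall for a shorter query when multi-character lexemes cover more than half of the keyword string.

import org.apache.lucene.search.Query;
import org.wltea.analyzer.query.SWMCQueryBuilder;

final class SWMCSketch {
    public static void main(String[] args) {
        Query q = SWMCQueryBuilder.create("text", "中文分词工具包", false);
        System.out.println("Query = " + q);
    }
}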

View File

@@ -1,147 +1,147 @@
///**
// * IK 中文分词 版本 5.0
// * IK Analyzer release 5.0
// *
// * Licensed to the Apache Software Foundation (ASF) under one or more
// * contributor license agreements. See the NOTICE file distributed with
// * this work for additional information regarding copyright ownership.
// * The ASF licenses this file to You under the Apache License, Version 2.0
// * (the "License"); you may not use this file except in compliance with
// * the License. You may obtain a copy of the License at
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// * 源代码由林良益(linliangyi2005@gmail.com)提供
// * 版权声明 2012乌龙茶工作室
// * provided by Linliangyi and copyright 2012 by Oolong studio
// *
// *
// */
//package org.wltea.analyzer.sample;
//
//import java.io.IOException;
//
//import org.apache.lucene.analysis.Analyzer;
//import org.apache.lucene.document.Document;
//import org.apache.lucene.document.Field;
//import org.apache.lucene.document.StringField;
//import org.apache.lucene.document.TextField;
//import org.apache.lucene.index.CorruptIndexException;
//import org.apache.lucene.index.DirectoryReader;
//import org.apache.lucene.index.IndexReader;
//import org.apache.lucene.index.IndexWriter;
//import org.apache.lucene.index.IndexWriterConfig;
//import org.apache.lucene.index.IndexWriterConfig.OpenMode;
//import org.apache.lucene.queryparser.classic.ParseException;
//import org.apache.lucene.queryparser.classic.QueryParser;
//import org.apache.lucene.search.IndexSearcher;
//import org.apache.lucene.search.Query;
//import org.apache.lucene.search.ScoreDoc;
//import org.apache.lucene.search.TopDocs;
//import org.apache.lucene.store.Directory;
//import org.apache.lucene.store.LockObtainFailedException;
//import org.apache.lucene.store.RAMDirectory;
//import org.apache.lucene.util.Version;
//import org.wltea.analyzer.lucene.IKAnalyzer;
//
//
//
//
///**
// * 使用IKAnalyzer进行Lucene索引和查询的演示
// * 2012-3-2
// *
// * 以下是结合Lucene4.0 API的写法
// *
// */
//public class LuceneIndexAndSearchDemo {
//
//
// /**
// * 模拟
// * 创建一个单条记录的索引并对其进行搜索
// * @param args
// */
// public static void main(String[] args){
// //Lucene Document的域名
// String fieldName = "text";
// //检索内容
// String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
//
// //实例化IKAnalyzer分词器
// Analyzer analyzer = new IKAnalyzer(true);
//
// Directory directory = null;
// IndexWriter iwriter = null;
// IndexReader ireader = null;
// IndexSearcher isearcher = null;
// try {
// //建立内存索引对象
// directory = new RAMDirectory();
//
// //配置IndexWriterConfig
// IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer);
// iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
// iwriter = new IndexWriter(directory , iwConfig);
// //写入索引
// Document doc = new Document();
// doc.add(new StringField("ID", "10000", Field.Store.YES));
// doc.add(new TextField(fieldName, text, Field.Store.YES));
// iwriter.addDocument(doc);
// iwriter.close();
//
//
// //搜索过程**********************************
// //实例化搜索器
// ireader = DirectoryReader.open(directory);
// isearcher = new IndexSearcher(ireader);
//
// String keyword = "中文分词工具包";
// //使用QueryParser查询分析器构造Query对象
// QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
// qp.setDefaultOperator(QueryParser.AND_OPERATOR);
// Query query = qp.parse(keyword);
// System.out.println("Query = " + query);
//
// //搜索相似度最高的5条记录
// TopDocs topDocs = isearcher.search(query , 5);
// System.out.println("命中:" + topDocs.totalHits);
// //输出结果
// ScoreDoc[] scoreDocs = topDocs.scoreDocs;
// for (int i = 0; i < topDocs.totalHits; i++){
// Document targetDoc = isearcher.doc(scoreDocs[i].doc);
// System.out.println("内容:" + targetDoc.toString());
// }
//
// } catch (CorruptIndexException e) {
// e.printStackTrace();
// } catch (LockObtainFailedException e) {
// e.printStackTrace();
// } catch (IOException e) {
// e.printStackTrace();
// } catch (ParseException e) {
// e.printStackTrace();
// } finally{
// if(ireader != null){
// try {
// ireader.close();
// } catch (IOException e) {
// e.printStackTrace();
// }
// }
// if(directory != null){
// try {
// directory.close();
// } catch (IOException e) {
// e.printStackTrace();
// }
// }
// }
// }
//}
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*
*/
package org.wltea.analyzer.sample;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
* Demo of Lucene indexing and search using IKAnalyzer
* 2012-3-2
*
* Written against the Lucene 4.0 API
*
*/
public class LuceneIndexAndSearchDemo {
/**
* Simulation:
* create a single-document index and search it
* @param args
*/
public static void main(String[] args){
//Lucene Document field name
String fieldName = "text";
//content to index and search
String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
//instantiate the IKAnalyzer segmenter
Analyzer analyzer = new IKAnalyzer(true);
Directory directory = null;
IndexWriter iwriter = null;
IndexReader ireader = null;
IndexSearcher isearcher = null;
try {
//create the in-memory index
directory = new RAMDirectory();
//configure the IndexWriterConfig
IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer);
iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
iwriter = new IndexWriter(directory , iwConfig);
//write the index
Document doc = new Document();
doc.add(new StringField("ID", "10000", Field.Store.YES));
doc.add(new TextField(fieldName, text, Field.Store.YES));
iwriter.addDocument(doc);
iwriter.close();
//search phase **********************************
//instantiate the searcher
ireader = DirectoryReader.open(directory);
isearcher = new IndexSearcher(ireader);
String keyword = "中文分词工具包";
//build the Query object with the QueryParser
QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
Query query = qp.parse(keyword);
System.out.println("Query = " + query);
//fetch the 5 highest-scoring records
TopDocs topDocs = isearcher.search(query , 5);
System.out.println("命中:" + topDocs.totalHits);
//print the results
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
for (int i = 0; i < topDocs.totalHits; i++){
Document targetDoc = isearcher.doc(scoreDocs[i].doc);
System.out.println("内容:" + targetDoc.toString());
}
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (LockObtainFailedException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (ParseException e) {
e.printStackTrace();
} finally{
if(ireader != null){
try {
ireader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if(directory != null){
try {
directory.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}