diff --git a/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java b/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java deleted file mode 100644 index f79d834..0000000 --- a/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java +++ /dev/null @@ -1,716 +0,0 @@ -/** - * IK 中文分词 版本 5.0 - * IK Analyzer release 5.0 - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * 源代码由林良益(linliangyi2005@gmail.com)提供 - * 版权声明 2012,乌龙茶工作室 - * provided by Linliangyi and copyright 2012 by Oolong studio - * - */ -package org.wltea.analyzer.query; - -import org.apache.lucene.index.Term; -import org.apache.lucene.search.*; -import org.apache.lucene.search.BooleanClause.Occur; -import org.apache.lucene.util.BytesRef; -import org.elasticsearch.common.logging.ESLogger; -import org.elasticsearch.common.logging.Loggers; - -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; -import java.util.Stack; - -/** - * IK简易查询表达式解析 - * 结合SWMCQuery算法 - * - * 表达式例子 : - * (id='1231231' && title:'monkey') || (content:'你好吗' || ulr='www.ik.com') - name:'helloword' - * @author linliangyi - * - */ -public class IKQueryExpressionParser { - - public static final ESLogger logger= Loggers.getLogger("ik-analyzer"); - - //public static final String LUCENE_SPECIAL_CHAR = "&&||-()':={}[],"; - - private List elements = new ArrayList(); - - private Stack querys = new Stack(); - - private Stack operates = new Stack(); - - /** - * 解析查询表达式,生成Lucene Query对象 - * - * @param expression - * @param quickMode - * @return Lucene query - */ - public Query parseExp(String expression , boolean quickMode){ - Query lucenceQuery = null; - if(expression != null && !"".equals(expression)){ - try{ - //文法解析 - this.splitElements(expression); - //语法解析 - this.parseSyntax(quickMode); - if(this.querys.size() == 1){ - lucenceQuery = this.querys.pop(); - }else{ - throw new IllegalStateException("表达式异常: 缺少逻辑操作符 或 括号缺失"); - } - }finally{ - elements.clear(); - querys.clear(); - operates.clear(); - } - } - return lucenceQuery; - } - - /** - * 表达式文法解析 - * @param expression - */ - private void splitElements(String expression){ - - if(expression == null){ - return; - } - Element curretElement = null; - - char[] expChars = expression.toCharArray(); - for(int i = 0 ; i < expChars.length ; i++){ - switch(expChars[i]){ - case '&' : - if(curretElement == null){ - curretElement = new Element(); - curretElement.type = '&'; - curretElement.append(expChars[i]); - }else if(curretElement.type == '&'){ - curretElement.append(expChars[i]); - this.elements.add(curretElement); - curretElement = null; - }else if(curretElement.type == '\''){ - curretElement.append(expChars[i]); - }else { - this.elements.add(curretElement); - curretElement = new Element(); - curretElement.type = '&'; - curretElement.append(expChars[i]); - } - break; - - case '|' : - if(curretElement == null){ - curretElement = new Element(); - curretElement.type = '|'; - curretElement.append(expChars[i]); - }else if(curretElement.type == '|'){ - curretElement.append(expChars[i]); - this.elements.add(curretElement); - curretElement = null; - }else if(curretElement.type == '\''){ - curretElement.append(expChars[i]); - }else { - this.elements.add(curretElement); - curretElement = new Element(); - curretElement.type = '|'; - curretElement.append(expChars[i]); - } - break; - - case '-' : - if(curretElement != null){ - if(curretElement.type == '\''){ - curretElement.append(expChars[i]); - continue; - }else{ - this.elements.add(curretElement); - } - } - curretElement = new Element(); - curretElement.type = '-'; - curretElement.append(expChars[i]); - this.elements.add(curretElement); - curretElement = null; - break; - - case '(' : - if(curretElement != null){ - if(curretElement.type == '\''){ - curretElement.append(expChars[i]); - continue; - }else{ - this.elements.add(curretElement); - } - } - curretElement = new Element(); - curretElement.type = '('; - curretElement.append(expChars[i]); - this.elements.add(curretElement); - curretElement = null; - break; - - case ')' : - if(curretElement != null){ - if(curretElement.type == '\''){ - curretElement.append(expChars[i]); - continue; - }else{ - this.elements.add(curretElement); - } - } - curretElement = new Element(); - curretElement.type = ')'; - curretElement.append(expChars[i]); - this.elements.add(curretElement); - curretElement = null; - break; - - case ':' : - if(curretElement != null){ - if(curretElement.type == '\''){ - curretElement.append(expChars[i]); - continue; - }else{ - this.elements.add(curretElement); - } - } - curretElement = new Element(); - curretElement.type = ':'; - curretElement.append(expChars[i]); - this.elements.add(curretElement); - curretElement = null; - break; - - case '=' : - if(curretElement != null){ - if(curretElement.type == '\''){ - curretElement.append(expChars[i]); - continue; - }else{ - this.elements.add(curretElement); - } - } - curretElement = new Element(); - curretElement.type = '='; - curretElement.append(expChars[i]); - this.elements.add(curretElement); - curretElement = null; - break; - - case ' ' : - if(curretElement != null){ - if(curretElement.type == '\''){ - curretElement.append(expChars[i]); - }else{ - this.elements.add(curretElement); - curretElement = null; - } - } - - break; - - case '\'' : - if(curretElement == null){ - curretElement = new Element(); - curretElement.type = '\''; - - }else if(curretElement.type == '\''){ - this.elements.add(curretElement); - curretElement = null; - - }else{ - this.elements.add(curretElement); - curretElement = new Element(); - curretElement.type = '\''; - - } - break; - - case '[': - if(curretElement != null){ - if(curretElement.type == '\''){ - curretElement.append(expChars[i]); - continue; - }else{ - this.elements.add(curretElement); - } - } - curretElement = new Element(); - curretElement.type = '['; - curretElement.append(expChars[i]); - this.elements.add(curretElement); - curretElement = null; - break; - - case ']': - if(curretElement != null){ - if(curretElement.type == '\''){ - curretElement.append(expChars[i]); - continue; - }else{ - this.elements.add(curretElement); - } - } - curretElement = new Element(); - curretElement.type = ']'; - curretElement.append(expChars[i]); - this.elements.add(curretElement); - curretElement = null; - - break; - - case '{': - if(curretElement != null){ - if(curretElement.type == '\''){ - curretElement.append(expChars[i]); - continue; - }else{ - this.elements.add(curretElement); - } - } - curretElement = new Element(); - curretElement.type = '{'; - curretElement.append(expChars[i]); - this.elements.add(curretElement); - curretElement = null; - break; - - case '}': - if(curretElement != null){ - if(curretElement.type == '\''){ - curretElement.append(expChars[i]); - continue; - }else{ - this.elements.add(curretElement); - } - } - curretElement = new Element(); - curretElement.type = '}'; - curretElement.append(expChars[i]); - this.elements.add(curretElement); - curretElement = null; - - break; - case ',': - if(curretElement != null){ - if(curretElement.type == '\''){ - curretElement.append(expChars[i]); - continue; - }else{ - this.elements.add(curretElement); - } - } - curretElement = new Element(); - curretElement.type = ','; - curretElement.append(expChars[i]); - this.elements.add(curretElement); - curretElement = null; - - break; - - default : - if(curretElement == null){ - curretElement = new Element(); - curretElement.type = 'F'; - curretElement.append(expChars[i]); - - }else if(curretElement.type == 'F'){ - curretElement.append(expChars[i]); - - }else if(curretElement.type == '\''){ - curretElement.append(expChars[i]); - - }else{ - this.elements.add(curretElement); - curretElement = new Element(); - curretElement.type = 'F'; - curretElement.append(expChars[i]); - } - } - } - - if(curretElement != null){ - this.elements.add(curretElement); - curretElement = null; - } - } - - /** - * 语法解析 - * - */ - private void parseSyntax(boolean quickMode){ - for(int i = 0 ; i < this.elements.size() ; i++){ - Element e = this.elements.get(i); - if('F' == e.type){ - Element e2 = this.elements.get(i + 1); - if('=' != e2.type && ':' != e2.type){ - throw new IllegalStateException("表达式异常: = 或 : 号丢失"); - } - Element e3 = this.elements.get(i + 2); - //处理 = 和 : 运算 - if('\'' == e3.type){ - i+=2; - if('=' == e2.type){ - TermQuery tQuery = new TermQuery(new Term(e.toString() , e3.toString())); - this.querys.push(tQuery); - }else if(':' == e2.type){ - String keyword = e3.toString(); - //SWMCQuery Here - Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword , quickMode); - this.querys.push(_SWMCQuery); - } - - }else if('[' == e3.type || '{' == e3.type){ - i+=2; - //处理 [] 和 {} - LinkedList eQueue = new LinkedList(); - eQueue.add(e3); - for( i++ ; i < this.elements.size() ; i++){ - Element eN = this.elements.get(i); - eQueue.add(eN); - if(']' == eN.type || '}' == eN.type){ - break; - } - } - //翻译RangeQuery - Query rangeQuery = this.toTermRangeQuery(e , eQueue); - this.querys.push(rangeQuery); - }else{ - throw new IllegalStateException("表达式异常:匹配值丢失"); - } - - }else if('(' == e.type){ - this.operates.push(e); - - }else if(')' == e.type){ - boolean doPop = true; - while(doPop && !this.operates.empty()){ - Element op = this.operates.pop(); - if('(' == op.type){ - doPop = false; - }else { - Query q = toBooleanQuery(op); - this.querys.push(q); - } - - } - }else{ - - if(this.operates.isEmpty()){ - this.operates.push(e); - }else{ - boolean doPeek = true; - while(doPeek && !this.operates.isEmpty()){ - Element eleOnTop = this.operates.peek(); - if('(' == eleOnTop.type){ - doPeek = false; - this.operates.push(e); - }else if(compare(e , eleOnTop) == 1){ - this.operates.push(e); - doPeek = false; - }else if(compare(e , eleOnTop) == 0){ - Query q = toBooleanQuery(eleOnTop); - this.operates.pop(); - this.querys.push(q); - }else{ - Query q = toBooleanQuery(eleOnTop); - this.operates.pop(); - this.querys.push(q); - } - } - - if(doPeek && this.operates.empty()){ - this.operates.push(e); - } - } - } - } - - while(!this.operates.isEmpty()){ - Element eleOnTop = this.operates.pop(); - Query q = toBooleanQuery(eleOnTop); - this.querys.push(q); - } - } - - /** - * 根据逻辑操作符,生成BooleanQuery - * @param op - * @return - */ - private Query toBooleanQuery(Element op){ - if(this.querys.size() == 0){ - return null; - } - - BooleanQuery resultQuery = new BooleanQuery(); - - if(this.querys.size() == 1){ - return this.querys.get(0); - } - - Query q2 = this.querys.pop(); - Query q1 = this.querys.pop(); - if('&' == op.type){ - if(q1 != null){ - if(q1 instanceof BooleanQuery){ - BooleanClause[] clauses = ((BooleanQuery)q1).getClauses(); - if(clauses.length > 0 - && clauses[0].getOccur() == Occur.MUST){ - for(BooleanClause c : clauses){ - resultQuery.add(c); - } - }else{ - resultQuery.add(q1,Occur.MUST); - } - - }else{ - //q1 instanceof TermQuery - //q1 instanceof TermRangeQuery - //q1 instanceof PhraseQuery - //others - resultQuery.add(q1,Occur.MUST); - } - } - - if(q2 != null){ - if(q2 instanceof BooleanQuery){ - BooleanClause[] clauses = ((BooleanQuery)q2).getClauses(); - if(clauses.length > 0 - && clauses[0].getOccur() == Occur.MUST){ - for(BooleanClause c : clauses){ - resultQuery.add(c); - } - }else{ - resultQuery.add(q2,Occur.MUST); - } - - }else{ - //q1 instanceof TermQuery - //q1 instanceof TermRangeQuery - //q1 instanceof PhraseQuery - //others - resultQuery.add(q2,Occur.MUST); - } - } - - }else if('|' == op.type){ - if(q1 != null){ - if(q1 instanceof BooleanQuery){ - BooleanClause[] clauses = ((BooleanQuery)q1).getClauses(); - if(clauses.length > 0 - && clauses[0].getOccur() == Occur.SHOULD){ - for(BooleanClause c : clauses){ - resultQuery.add(c); - } - }else{ - resultQuery.add(q1,Occur.SHOULD); - } - - }else{ - //q1 instanceof TermQuery - //q1 instanceof TermRangeQuery - //q1 instanceof PhraseQuery - //others - resultQuery.add(q1,Occur.SHOULD); - } - } - - if(q2 != null){ - if(q2 instanceof BooleanQuery){ - BooleanClause[] clauses = ((BooleanQuery)q2).getClauses(); - if(clauses.length > 0 - && clauses[0].getOccur() == Occur.SHOULD){ - for(BooleanClause c : clauses){ - resultQuery.add(c); - } - }else{ - resultQuery.add(q2,Occur.SHOULD); - } - }else{ - //q2 instanceof TermQuery - //q2 instanceof TermRangeQuery - //q2 instanceof PhraseQuery - //others - resultQuery.add(q2,Occur.SHOULD); - - } - } - - }else if('-' == op.type){ - if(q1 == null || q2 == null){ - throw new IllegalStateException("表达式异常:SubQuery 个数不匹配"); - } - - if(q1 instanceof BooleanQuery){ - BooleanClause[] clauses = ((BooleanQuery)q1).getClauses(); - if(clauses.length > 0){ - for(BooleanClause c : clauses){ - resultQuery.add(c); - } - }else{ - resultQuery.add(q1,Occur.MUST); - } - - }else{ - //q1 instanceof TermQuery - //q1 instanceof TermRangeQuery - //q1 instanceof PhraseQuery - //others - resultQuery.add(q1,Occur.MUST); - } - - resultQuery.add(q2,Occur.MUST_NOT); - } - return resultQuery; - } - - /** - * 组装TermRangeQuery - * @param elements - * @return - */ - private TermRangeQuery toTermRangeQuery(Element fieldNameEle , LinkedList elements){ - - boolean includeFirst = false; - boolean includeLast = false; - String firstValue = null; - String lastValue = null; - //检查第一个元素是否是[或者{ - Element first = elements.getFirst(); - if('[' == first.type){ - includeFirst = true; - }else if('{' == first.type){ - includeFirst = false; - }else { - throw new IllegalStateException("表达式异常"); - } - //检查最后一个元素是否是]或者} - Element last = elements.getLast(); - if(']' == last.type){ - includeLast = true; - }else if('}' == last.type){ - includeLast = false; - }else { - throw new IllegalStateException("表达式异常, RangeQuery缺少结束括号"); - } - if(elements.size() < 4 || elements.size() > 5){ - throw new IllegalStateException("表达式异常, RangeQuery 错误"); - } - //读出中间部分 - Element e2 = elements.get(1); - if('\'' == e2.type){ - firstValue = e2.toString(); - // - Element e3 = elements.get(2); - if(',' != e3.type){ - throw new IllegalStateException("表达式异常, RangeQuery缺少逗号分隔"); - } - // - Element e4 = elements.get(3); - if('\'' == e4.type){ - lastValue = e4.toString(); - }else if(e4 != last){ - throw new IllegalStateException("表达式异常,RangeQuery格式错误"); - } - }else if(',' == e2.type){ - firstValue = null; - // - Element e3 = elements.get(2); - if('\'' == e3.type){ - lastValue = e3.toString(); - }else{ - throw new IllegalStateException("表达式异常,RangeQuery格式错误"); - } - - }else { - throw new IllegalStateException("表达式异常, RangeQuery格式错误"); - } - - return new TermRangeQuery(fieldNameEle.toString() , new BytesRef(firstValue) , new BytesRef(lastValue) , includeFirst , includeLast); - } - - /** - * 比较操作符优先级 - * @param e1 - * @param e2 - * @return - */ - private int compare(Element e1 , Element e2){ - if('&' == e1.type){ - if('&' == e2.type){ - return 0; - }else { - return 1; - } - }else if('|' == e1.type){ - if('&' == e2.type){ - return -1; - }else if('|' == e2.type){ - return 0; - }else{ - return 1; - } - }else{ - if('-' == e2.type){ - return 0; - }else{ - return -1; - } - } - } - - /** - * 表达式元素(操作符、FieldName、FieldValue) - * @author linliangyi - * May 20, 2010 - */ - private class Element{ - char type = 0; - StringBuffer eleTextBuff; - - public Element(){ - eleTextBuff = new StringBuffer(); - } - - public void append(char c){ - this.eleTextBuff.append(c); - } - - public String toString(){ - return this.eleTextBuff.toString(); - } - } - - public static void main(String[] args){ - IKQueryExpressionParser parser = new IKQueryExpressionParser(); - //String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'"; - String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'"; - Query result = parser.parseExp(ikQueryExp , true); - logger.info(result.toString()); - - } - -} diff --git a/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java b/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java deleted file mode 100644 index 9d730a9..0000000 --- a/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java +++ /dev/null @@ -1,158 +0,0 @@ -/** - * IK 中文分词 版本 5.0 - * IK Analyzer release 5.0 - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * 源代码由林良益(linliangyi2005@gmail.com)提供 - * 版权声明 2012,乌龙茶工作室 - * provided by Linliangyi and copyright 2012 by Oolong studio - * - */ -package org.wltea.analyzer.query; - -import java.io.IOException; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.List; - -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.queryparser.classic.ParseException; -import org.apache.lucene.queryparser.classic.QueryParser; -import org.apache.lucene.search.Query; -import org.apache.lucene.util.Version; -import org.elasticsearch.common.logging.ESLogger; -import org.elasticsearch.common.logging.Loggers; -import org.wltea.analyzer.core.IKSegmenter; -import org.wltea.analyzer.core.Lexeme; - -/** - * Single Word Multi Char Query Builder - * IK分词算法专用 - * @author linliangyi - * - */ -public class SWMCQueryBuilder { - - public static ESLogger logger= Loggers.getLogger("ik-analyzer"); - - /** - * 生成SWMCQuery - * @param fieldName - * @param keywords - * @param quickMode - * @return Lucene Query - */ - public static Query create(String fieldName ,String keywords , boolean quickMode){ - if(fieldName == null || keywords == null){ - throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null."); - } - //1.对keywords进行分词处理 - List lexemes = doAnalyze(keywords); - //2.根据分词结果,生成SWMCQuery - Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode); - return _SWMCQuery; - } - - /** - * 分词切分,并返回结链表 - * @param keywords - * @return - */ - private static List doAnalyze(String keywords){ - List lexemes = new ArrayList(); - - IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords),true); - try{ - Lexeme l = null; - while( (l = ikSeg.next()) != null){ - lexemes.add(l); - } - }catch(IOException e){ - logger.error(e.getMessage(), e); - } - return lexemes; - } - - - /** - * 根据分词结果生成SWMC搜索 - * @param fieldName - // * @param pathOption - * @param quickMode - * @return - */ - private static Query getSWMCQuery(String fieldName , List lexemes , boolean quickMode){ - //构造SWMC的查询表达式 - StringBuffer keywordBuffer = new StringBuffer(); - //精简的SWMC的查询表达式 - StringBuffer keywordBuffer_Short = new StringBuffer(); - //记录最后词元长度 - int lastLexemeLength = 0; - //记录最后词元结束位置 - int lastLexemeEnd = -1; - - int shortCount = 0; - int totalCount = 0; - for(Lexeme l : lexemes){ - totalCount += l.getLength(); - //精简表达式 - if(l.getLength() > 1){ - keywordBuffer_Short.append(' ').append(l.getLexemeText()); - shortCount += l.getLength(); - } - - if(lastLexemeLength == 0){ - keywordBuffer.append(l.getLexemeText()); - }else if(lastLexemeLength == 1 && l.getLength() == 1 - && lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并) - keywordBuffer.append(l.getLexemeText()); - }else{ - keywordBuffer.append(' ').append(l.getLexemeText()); - - } - lastLexemeLength = l.getLength(); - lastLexemeEnd = l.getEndPosition(); - } - - //借助lucene queryparser 生成SWMC Query - QueryParser qp = new QueryParser(fieldName, new StandardAnalyzer()); - qp.setDefaultOperator(QueryParser.AND_OPERATOR); - qp.setAutoGeneratePhraseQueries(true); - - if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){ - try { - //System.out.println(keywordBuffer.toString()); - Query q = qp.parse(keywordBuffer_Short.toString()); - return q; - } catch (ParseException e) { - logger.error(e.getMessage(), e); - } - - }else{ - if(keywordBuffer.length() > 0){ - try { - //System.out.println(keywordBuffer.toString()); - Query q = qp.parse(keywordBuffer.toString()); - return q; - } catch (ParseException e) { - logger.error(e.getMessage(), e); - } - } - } - return null; - } -} \ No newline at end of file