remove keywordext.cpp/h out of src

2025-07-18 00:00:12 +08:00 · 2013-11-03 07:47:09 -08:00 · 2013-11-03 07:47:09 -08:00 · ae09f92b19
commit ae09f92b19
parent a6e6568b85
5 changed files with 2 additions and 486 deletions
--- a/src/cppjieba/CMakeLists.txt
+++ b/src/cppjieba/CMakeLists.txt
@@ -1,4 +1,4 @@
-SET(LIBCPPJIEBA_SRC HMMSegment.cpp  KeyWordExt.cpp  MixSegment.cpp  MPSegment.cpp  Trie.cpp)
+SET(LIBCPPJIEBA_SRC HMMSegment.cpp MixSegment.cpp  MPSegment.cpp  Trie.cpp)

 INCLUDE_DIRECTORIES(../limonp)

@@ -7,4 +7,4 @@ ADD_LIBRARY(cppjieba SHARED ${LIBCPPJIEBA_SRC})
 SET_TARGET_PROPERTIES(cppjieba PROPERTIES VERSION 1.2 SOVERSION 1)

 INSTALL(TARGETS cppjieba LIBRARY DESTINATION lib/CppJieba)
-INSTALL(FILES ChineseFilter.hpp HMMSegment.h KeyWordExt.h MPSegment.h structs.h Trie.h globals.h          ISegment.hpp  MixSegment.h  SegmentBase.hpp  TransCode.hpp  DESTINATION include/CppJieba)
+INSTALL(FILES ChineseFilter.hpp HMMSegment.h MPSegment.h structs.h Trie.h globals.h ISegment.hpp  MixSegment.h  SegmentBase.hpp  TransCode.hpp  DESTINATION include/CppJieba)
--- a/src/cppjieba/KeyWordExt.cpp
+++ b/src/cppjieba/KeyWordExt.cpp
@@ -1,360 +0,0 @@
-/************************************
- * file enc : ASCII
- * author   : wuyanyi09@gmail.com
-************************************/
-#include "KeyWordExt.h"
-
-
-namespace CppJieba
-{
-
-    KeyWordExt::KeyWordExt()
-    {
-    }
-    
-    KeyWordExt::~KeyWordExt()
-    {
-    }
-
-    bool KeyWordExt::init(const char* const segDictFile)
-    {
-        LogInfo("KeyWordExt init start ...");
-        if(!_segment.init(segDictFile))
-        {
-            LogError("_segment.init failed.");
-            return false;
-        }
-        return true;
-    }
-
-    bool KeyWordExt::loadStopWords(const char * const filePath)
-    {
-
-        LogInfo("_loadStopWords(%s) start", filePath);
-        if(!_stopWords.empty())
-        {
-            LogError("_stopWords has been loaded before! ");
-            return false;
-        }
-        if(!checkFileExist(filePath))
-        {
-            LogError("cann't find file[%s].",filePath);
-            return false;
-        }
-
-        ifstream ifile(filePath);
-        string line;
-        Unicode word;
-        while(getline(ifile, line))
-        {
-            if(!TransCode::decode(line, word))
-            {
-                LogError("decode failed .");
-                return false;
-            }
-            _stopWords.insert(word);
-        }
-        LogInfo("load stopwords[%d] finished.", _stopWords.size());
-        
-        return true;
-    }
-    
-    bool KeyWordExt::dispose()
-    {
-        _segment.dispose();
-        return true;
-    }
-
-    bool KeyWordExt::_wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b)
-    {
-        return a.weight > b.weight;
-    }
-
-    bool KeyWordExt::_sortWLIDF(vector<KeyWordInfo>& wordInfos)
-    {
-        for(uint i = 0; i < wordInfos.size(); i++)
-        {
-            KeyWordInfo& wInfo = wordInfos[i];
-            wInfo.idf = - wInfo.logFreq;
-            wInfo.weight = log(double(wInfo.word.size() + 1)) * wInfo.idf;
-        }
-        sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
-        return true;
-    }
-
-    bool KeyWordExt::_extTopN(vector<KeyWordInfo>& wordInfos, uint topN)
-    {
-        int dis = wordInfos.size() - topN;
-        if(dis <= 0)
-        {
-            return true;
-        }
-        
-        if(uint(dis) <= topN)
-        {
-            for(int i = 0; i< dis; i++)
-            {
-                wordInfos.pop_back();
-            }
-        }
-        else// in case that topN << size;
-        {
-            
-            vector<KeyWordInfo> tmp(wordInfos.begin(), wordInfos.begin() + topN);
-            wordInfos.swap(tmp);
-        }
-        return true;
-    }
-
-
-    bool KeyWordExt::extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN)
-    {
-        if(words.empty())
-        {
-            return false;
-        }
-
-        keyWordInfos.clear();
-        for(uint i = 0; i < words.size(); i++)
-        {
-            Unicode uniWord;
-            if(!TransCode::decode(words[i], uniWord))
-            {
-                LogError("decode failed");
-                return false;
-            }
-            keyWordInfos.push_back(uniWord);
-        }
-
-        return _extract(keyWordInfos, topN);
-    }
-
-    bool KeyWordExt::extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN)
-    {
-        if(title.empty())
-        {
-            return false;
-        }
-        
-        vector<TrieNodeInfo> trieNodeInfos; 
-        Unicode unico;
-        if(!TransCode::decode(title, unico))
-        {
-            return false;
-        }
-        _segment.cut(unico.begin(), unico.end(), trieNodeInfos);
-
-        keyWordInfos.clear();
-        for(uint i = 0; i < trieNodeInfos.size(); i++)
-        {
-            keyWordInfos.push_back(trieNodeInfos[i]);
-        }
-        return _extract(keyWordInfos, topN);
-    }
-
-    bool KeyWordExt::_extract(vector<KeyWordInfo>& keyWordInfos, uint topN)
-    {
-        if(!_filter(keyWordInfos))
-        {
-            LogError("_filter failed.");
-            return false;
-        }
-
-        if(!_sortWLIDF(keyWordInfos))
-        {
-            LogError("_sortWLIDF failed.");
-            return false;
-        }
-
-        if(!_extTopN(keyWordInfos, topN))
-        {
-            LogError("_extTopN failed.");
-            return false;
-        }
-
-        return true;
-    }
-
-    bool KeyWordExt::_filter(vector<KeyWordInfo>& wordInfos)
-    {
-        if(!_filterDuplicate(wordInfos))
-        {
-            LogError("_filterDuplicate failed.");
-            return false;
-        }
-
-        if(!_filterSingleWord(wordInfos))
-        {
-            LogError("_filterSingleWord failed.");
-            return false;
-        }
-
-        if(!_filterStopWords(wordInfos))
-        {
-            LogError("_filterStopWords failed.");
-            return false;
-        }
-
-        if(!_filterSubstr(wordInfos))
-        {
-            LogError("_filterSubstr failed.");
-            return false;
-        }
-
-        return true;
-    }
-
-    bool KeyWordExt::_filterStopWords(vector<KeyWordInfo>& wordInfos)
-    {
-        if(_stopWords.empty())
-        {
-            return true;
-        }
-        for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end();)
-        {
-            if(_stopWords.find(it->word) != _stopWords.end())
-            {
-                it = wordInfos.erase(it);
-            }
-            else
-            {
-                it ++;
-            }
-        }
-        return true;
-    }
-
-
-    bool KeyWordExt::_filterDuplicate(vector<KeyWordInfo>& wordInfos)
-    {
-        set<Unicode> st;
-        for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
-        {
-            if(st.find(it->word) != st.end())
-            {
-                it = wordInfos.erase(it);
-            }
-            else
-            {
-                st.insert(it->word);
-                it++;
-            }
-        }
-        return true;
-    }
-
-    bool KeyWordExt::_filterSingleWord(vector<KeyWordInfo>& wordInfos)
-    {
-        for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end();)
-        {
-
-            // filter single word
-            if(1 == it->word.size())
-            {
-                it = wordInfos.erase(it);
-            }
-            else
-            {
-                it++;
-            }
-        }
-        return true;
-    }
-
-    bool KeyWordExt::_filterSubstr(vector<KeyWordInfo>& wordInfos)
-    {
-        vector<Unicode> tmp ;
-        for(uint i = 0; i < wordInfos.size(); i++)
-        {
-            tmp.push_back(wordInfos[i].word);
-        }
-
-        for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
-        {
-            if(_isSubIn(tmp, it->word))
-            {
-                it = wordInfos.erase(it);
-            }
-            else
-            {
-                it++;
-            }
-        }
-
-        return true;
-    }
-
-    //bool KeyWordExt::_isContainSubWords(const string& word)
-    //{
-    //    for(uint i = 0; i < _priorSubWords.size(); i++)
-    //    {
-    //        if(string::npos != word.find(_priorSubWords[i]))
-    //        {
-    //            return true;
-    //        }
-    //    }
-    //    return false;
-    //}
-
-    //bool KeyWordExt::_prioritizeSubWords(vector<KeyWordInfo>& wordInfos)
-    //{
-    //    if(2 > wordInfos.size())
-    //    {
-    //        return true;
-    //    }
-
-    //    KeyWordInfo prior;
-    //    bool flag = false;
-    //    for(vector<KeyWordInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); )
-    //    {
-    //        if(_isContainSubWords(it->word))
-    //        {
-    //            prior = *it;
-    //            it = wordInfos.erase(it);
-    //            flag = true;
-    //            break;
-    //        }
-    //        else
-    //        {
-    //            it ++;
-    //        }
-    //    }
-    //    if(flag)
-    //    {
-    //        wordInfos.insert(wordInfos.begin(), prior);
-    //    }
-    //    return true;
-    //}
-}
-
-
-#ifdef KEYWORDEXT_UT
-
-using namespace CppJieba;
-
-int main()
-{
-    KeyWordExt ext;
-    ext.init();
-    if(!ext.loadSegDict("../dicts/segdict.gbk.v2.1"))
-    {
-        return 1;
-    }
-    ext._loadStopWords("../dicts/stopwords.gbk.v1.0");
-
-    ifstream ifile("testtitle.gbk");
-    vector<string> res;
-    string line;
-    while(getline(ifile, line))
-    {
-        cout<<line<<endl;
-        res.clear();
-        ext.extract(line, res, 20);
-        PRINT_VECTOR(res);
-    }
-
-    ext.dispose();
-    return 0;
-}
-
-#endif
--- a/src/cppjieba/KeyWordExt.h
+++ b/src/cppjieba/KeyWordExt.h
@@ -1,68 +0,0 @@
-/************************************
- * file enc : ASCII
- * author   : wuyanyi09@gmail.com
- ************************************/
-#ifndef CPPJIEBA_KEYWORDEXT_H
-#define CPPJIEBA_KEYWORDEXT_H
-
-#include <logger.hpp>
-#include "MPSegment.h"
-#include "structs.h"
-
-namespace CppJieba
-{
-
-    class KeyWordExt
-    {
-        private:
-            MPSegment _segment;
-            //vector<string> _priorSubWords;
-            set<Unicode> _stopWords;
-        public:
-            KeyWordExt();
-            ~KeyWordExt();
-            bool init(const char* const segDictFile);
-            bool dispose();
-            bool loadStopWords(const char * const filePath);
-        private:
-            //bool _loadPriorSubWords(const char * const filePath);
-
-
-        public:
-            bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
-            bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);
-        private:
-            static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);
-        private:
-            bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
-            bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);
-        private:
-            //sort by word len - idf
-            bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);
-        private:
-            bool _filter(vector<KeyWordInfo>& );
-            bool _filterDuplicate(vector<KeyWordInfo>& );
-            bool _filterSingleWord(vector<KeyWordInfo>& );
-            bool _filterSubstr(vector<KeyWordInfo>& );
-            bool _filterStopWords(vector<KeyWordInfo>& );
-        private:
-            inline bool _isSubIn(const vector<Unicode>& words, const Unicode& word)const
-            {
-
-                for(uint j = 0; j < words.size(); j++)
-                {
-                    if(word != words[j] && words[j].end() != search(words[j].begin(), words[j].end(), word.begin(), word.end()))
-                    {
-                        return true;
-                    }
-                }
-                return false;
-            }
-            //bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
-            //bool _isContainSubWords(const string& word);
-
-    };
-
-}
-
-#endif
--- a/src/keywordext.cpp
+++ b/src/keywordext.cpp
@@ -1,56 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <ArgvContext.hpp>
-#include "../cppjieba/KeyWordExt.h"
-
-using namespace CppJieba;
-
-
-void testKeyWordExt(const char * dictPath, const char * filePath)
-{
-    KeyWordExt ext;
-    if(!ext.init(dictPath))
-    {
-        return;
-    }
-
-    ifstream ifile(filePath);
-    vector<KeyWordInfo> res;
-    string line;
-    while(getline(ifile, line))
-    {
-        res.clear();
-        if(!line.empty())
-        {
-            ext.extract(line, res, 20);
-            cout<<line<<'\n'<<res<<endl;
-        }
-
-    }
-    ext.dispose();
-}
-
-const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
-
-int main(int argc, char ** argv)
-{
-    if(2 > argc)
-    {
-        cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
-            <<"options:\n"
-            <<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<"\n"
-            <<"examples:\n"
-            <<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
-            <<endl;
-        return -1;
-    }
-
-    ArgvContext arg(argc, argv);
-    string dictPath = arg["--dictpath"];
-    if("" == dictPath)
-    {
-        dictPath = DEFAULT_DICTPATH;
-    }
-    testKeyWordExt(dictPath.c_str(), arg[1].c_str());
-    return 0;
-}
--- a/test/testlines.utf8
+++ b/test/testlines.utf8