mirror of
https://gitee.com/clygintang/Dockfile-Coreseek.git
synced 2025-07-21 00:00:15 +08:00
1196 lines
26 KiB
C++
Executable File
1196 lines
26 KiB
C++
Executable File
//
|
|
// $Id$
|
|
//
|
|
|
|
//
|
|
// Copyright (c) 2001-2011, Andrew Aksyonoff
|
|
// Copyright (c) 2008-2011, Sphinx Technologies Inc
|
|
// All rights reserved
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License. You should have
|
|
// received a copy of the GPL license along with this program; if you
|
|
// did not, you can find it at http://www.gnu.org/
|
|
//
|
|
|
|
#include "sphinx.h"
|
|
#include "sphinxutils.h"
|
|
|
|
#include <locale.h>
|
|
|
|
const int MAX_STR_LENGTH = 512;
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
BYTE GetWordchar ( const char * & szSet )
|
|
{
|
|
if ( *szSet=='\\' )
|
|
{
|
|
if ( !szSet[1] || !szSet[2] || !szSet[3] )
|
|
return 0;
|
|
|
|
char szBuf[3];
|
|
memcpy ( szBuf, szSet+2, 2 );
|
|
szBuf[2] = 0;
|
|
|
|
char * szStop = NULL;
|
|
int iRes = strtol ( szBuf, &szStop, 16 );
|
|
if ( szStop!=szBuf+2 || iRes<0 || iRes>255 )
|
|
return 0;
|
|
|
|
szSet += 4;
|
|
return (BYTE) iRes;
|
|
}
|
|
|
|
return *szSet++;
|
|
}
|
|
|
|
|
|
bool IsInSet ( BYTE uLetter, const char * szSet )
|
|
{
|
|
if ( !szSet )
|
|
return false;
|
|
|
|
bool bInvert = ( *szSet=='^' );
|
|
if ( bInvert )
|
|
++szSet;
|
|
|
|
const char * szSep = strchr ( szSet, '-' );
|
|
bool bRange = ( szSep!=NULL );
|
|
|
|
if ( bRange )
|
|
{
|
|
BYTE uRange1 = GetWordchar ( szSet );
|
|
szSep++;
|
|
BYTE uRange2 = GetWordchar ( szSep );
|
|
|
|
if ( uLetter>=Min ( uRange1, uRange2 ) && uLetter<=Max ( uRange1, uRange2 ) )
|
|
return !bInvert;
|
|
|
|
} else
|
|
{
|
|
BYTE uChar = 0;
|
|
while ( ( uChar = GetWordchar ( szSet ) )!=0 )
|
|
if ( uChar==uLetter )
|
|
break;
|
|
|
|
bool bEnd = !uChar;
|
|
|
|
if ( bInvert && bEnd )
|
|
return true;
|
|
|
|
if ( !bInvert && !bEnd )
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
bool GetSetMinMax ( const char * szSet, BYTE & uMin, BYTE & uMax )
|
|
{
|
|
if ( !szSet || !*szSet )
|
|
return false;
|
|
|
|
uMin = GetWordchar ( szSet );
|
|
uMax = uMin;
|
|
|
|
BYTE uChar;
|
|
while ( ( uChar = GetWordchar ( szSet ) )!=0 )
|
|
if ( uChar!='-' )
|
|
{
|
|
uMin = Min ( uMin, uChar );
|
|
uMax = Max ( uMax, uChar );
|
|
}
|
|
|
|
if ( !uMin || !uMax )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
class CISpellDict
|
|
{
|
|
public:
|
|
struct CISpellDictWord
|
|
{
|
|
CSphString m_sWord;
|
|
CSphString m_sFlags;
|
|
};
|
|
|
|
bool Load ( const char * szFilename );
|
|
void IterateStart ();
|
|
const CISpellDictWord * IterateNext ();
|
|
|
|
private:
|
|
CSphVector < CISpellDictWord > m_dEntries;
|
|
int m_iIterator;
|
|
};
|
|
|
|
|
|
bool CISpellDict::Load ( const char * szFilename )
|
|
{
|
|
if ( !szFilename )
|
|
return false;
|
|
|
|
m_dEntries.Reset ();
|
|
m_dEntries.Reserve ( 131072 );
|
|
|
|
FILE * pFile = fopen ( szFilename, "rt" );
|
|
if ( !pFile )
|
|
return false;
|
|
|
|
char szWordBuffer [MAX_STR_LENGTH];
|
|
|
|
while ( !feof ( pFile ) )
|
|
{
|
|
char * szResult = fgets ( szWordBuffer, MAX_STR_LENGTH, pFile );
|
|
if ( !szResult )
|
|
break;
|
|
|
|
int iPos = strlen ( szWordBuffer ) - 1;
|
|
while ( iPos>=0 && isspace ( (unsigned char)szWordBuffer[iPos] ) )
|
|
szWordBuffer [iPos--] = '\0';
|
|
|
|
CISpellDictWord Word;
|
|
|
|
char * szPosition = strchr ( szWordBuffer, '/' );
|
|
if ( !szPosition )
|
|
{
|
|
szPosition = szWordBuffer;
|
|
while ( *szPosition && !isspace ( (unsigned char)*szPosition ) )
|
|
++szPosition;
|
|
|
|
*szPosition = '\0';
|
|
Word.m_sWord = szWordBuffer;
|
|
|
|
} else
|
|
{
|
|
*szPosition = '\0';
|
|
Word.m_sWord = szWordBuffer;
|
|
++szPosition;
|
|
char * szFlags = szPosition;
|
|
while ( *szPosition && !isspace ( (unsigned char)*szPosition ) )
|
|
++szPosition;
|
|
|
|
*szPosition = '\0';
|
|
Word.m_sFlags = szFlags;
|
|
}
|
|
|
|
m_dEntries.Add ( Word );
|
|
}
|
|
|
|
fclose ( pFile );
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
void CISpellDict::IterateStart ()
|
|
{
|
|
m_iIterator = 0;
|
|
}
|
|
|
|
|
|
const CISpellDict::CISpellDictWord * CISpellDict::IterateNext ()
|
|
{
|
|
if ( m_iIterator>=m_dEntries.GetLength() )
|
|
return NULL;
|
|
|
|
return &m_dEntries [m_iIterator++];
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
enum RuleType_e
|
|
{
|
|
RULE_NONE,
|
|
RULE_PREFIXES,
|
|
RULE_SUFFIXES
|
|
};
|
|
|
|
|
|
class CISpellAffixRule
|
|
{
|
|
public:
|
|
CISpellAffixRule () {}
|
|
CISpellAffixRule ( RuleType_e eRule, char cFlag, bool bCrossProduct, char * szCondition, char * szStrip, char * szAppend );
|
|
|
|
bool Apply ( CSphString & sWord );
|
|
char Flag () const;
|
|
bool IsCrossProduct () const;
|
|
bool IsPrefix () const;
|
|
|
|
private:
|
|
RuleType_e m_eRule;
|
|
char m_cFlag;
|
|
bool m_bCrossProduct;
|
|
CSphString m_sCondition;
|
|
CSphString m_sStrip;
|
|
CSphString m_sAppend;
|
|
|
|
int m_iWordLen;
|
|
int m_iCondLen;
|
|
int m_iStripLen;
|
|
int m_iAppendLen;
|
|
|
|
bool CheckSuffix ( const CSphString & sWord ) const;
|
|
bool CheckPrefix ( const CSphString & sWord ) const;
|
|
bool StripAppendSuffix ( CSphString & sWord ) const;
|
|
bool StripAppendPrefix ( CSphString & sWord ) const;
|
|
};
|
|
|
|
|
|
CISpellAffixRule::CISpellAffixRule ( RuleType_e eRule, char cFlag, bool bCrossProduct, char * szCondition, char * szStrip, char * szAppend )
|
|
: m_eRule ( eRule )
|
|
, m_cFlag ( cFlag )
|
|
, m_bCrossProduct ( bCrossProduct )
|
|
, m_sCondition ( szCondition )
|
|
, m_sStrip ( szStrip )
|
|
, m_sAppend ( szAppend )
|
|
, m_iWordLen ( 0 )
|
|
{
|
|
m_iCondLen = szCondition ? strlen ( szCondition ) : 0;
|
|
m_iStripLen = szStrip ? strlen ( szStrip ) : 0;
|
|
m_iAppendLen = szAppend ? strlen ( szAppend ) : 0;
|
|
}
|
|
|
|
|
|
bool CISpellAffixRule::Apply ( CSphString & sWord )
|
|
{
|
|
if ( m_sCondition.IsEmpty () )
|
|
return true;
|
|
|
|
if ( sWord.IsEmpty () )
|
|
return false;
|
|
|
|
m_iWordLen = strlen ( sWord.cstr () );
|
|
|
|
bool bDotCond = ( m_sCondition=="." );
|
|
if ( m_eRule==RULE_SUFFIXES )
|
|
{
|
|
if ( !bDotCond && !CheckSuffix ( sWord ) )
|
|
return false;
|
|
|
|
if ( !StripAppendSuffix ( sWord ) )
|
|
return false;
|
|
} else
|
|
{
|
|
if ( !bDotCond && !CheckPrefix ( sWord ) )
|
|
return false;
|
|
|
|
if ( !StripAppendPrefix ( sWord ) )
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
bool CISpellAffixRule::CheckSuffix ( const CSphString & sWord ) const
|
|
{
|
|
int iCondI = m_iCondLen-1;
|
|
for ( int i=m_iWordLen-1; iCondI>=0 && i>=0; --i )
|
|
{
|
|
if ( m_sCondition.cstr()[iCondI]!=']' )
|
|
{
|
|
if ( m_sCondition.cstr()[iCondI]!=sWord.cstr()[i] )
|
|
return false;
|
|
|
|
--iCondI;
|
|
|
|
} else
|
|
{
|
|
int iRangeStart = -1;
|
|
|
|
for ( int j=iCondI; j>=0 && iRangeStart==-1; --j )
|
|
if ( m_sCondition.cstr()[j]=='[' )
|
|
iRangeStart = j;
|
|
|
|
if ( iRangeStart==-1 )
|
|
return false;
|
|
else
|
|
{
|
|
if ( !IsInSet ( sWord.cstr () [i], m_sCondition.SubString ( iRangeStart + 1, iCondI - iRangeStart - 1 ).cstr () ) )
|
|
return false;
|
|
|
|
iCondI = iRangeStart - 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
bool CISpellAffixRule::StripAppendSuffix ( CSphString & sWord ) const
|
|
{
|
|
static char szTmp [ MAX_STR_LENGTH];
|
|
|
|
if ( !m_sStrip.IsEmpty () )
|
|
{
|
|
if ( m_iWordLen < m_iStripLen )
|
|
return false;
|
|
|
|
if ( strncmp ( sWord.cstr () + m_iWordLen - m_iStripLen, m_sStrip.cstr (), m_iStripLen ) )
|
|
return false;
|
|
}
|
|
|
|
strncpy ( szTmp, sWord.cstr (), m_iWordLen - m_iStripLen );
|
|
szTmp [m_iWordLen - m_iStripLen] = '\0';
|
|
|
|
if ( !m_sAppend.IsEmpty () )
|
|
strcat ( szTmp, m_sAppend.cstr () ); // NOLINT
|
|
|
|
sWord = szTmp;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
bool CISpellAffixRule::CheckPrefix ( const CSphString & sWord ) const
|
|
{
|
|
int iCondI = 0;
|
|
for ( int i = 0; iCondI < m_iCondLen && i < m_iWordLen; ++i )
|
|
{
|
|
if ( m_sCondition.cstr()[iCondI]!='[' )
|
|
{
|
|
if ( m_sCondition.cstr()[iCondI]!=sWord.cstr()[i] )
|
|
return false;
|
|
|
|
++iCondI;
|
|
|
|
} else
|
|
{
|
|
int iRangeEnd = -1;
|
|
|
|
for ( int j=iCondI; j<m_iCondLen && iRangeEnd==-1; ++j )
|
|
if ( m_sCondition.cstr()[j]==']' )
|
|
iRangeEnd = j;
|
|
|
|
if ( iRangeEnd==-1 )
|
|
return false;
|
|
else
|
|
{
|
|
if ( !IsInSet ( sWord.cstr () [i], m_sCondition.SubString ( iCondI + 1, iRangeEnd - iCondI - 1 ).cstr () ) )
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
bool CISpellAffixRule::StripAppendPrefix ( CSphString & sWord ) const
|
|
{
|
|
static char szTmp [MAX_STR_LENGTH];
|
|
|
|
if ( !m_sStrip.IsEmpty () )
|
|
{
|
|
const char * Pos = strstr ( sWord.cstr (), m_sStrip.cstr () );
|
|
if ( Pos!=sWord.cstr() )
|
|
return false;
|
|
}
|
|
|
|
if ( !m_sAppend.IsEmpty () )
|
|
strcpy ( szTmp, m_sAppend.cstr() ); // NOLINT
|
|
|
|
strncpy ( szTmp + m_iAppendLen, sWord.cstr () + m_iStripLen, m_iWordLen - m_iStripLen );
|
|
szTmp [m_iWordLen - m_iStripLen + m_iAppendLen] = '\0';
|
|
|
|
sWord = szTmp;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
char CISpellAffixRule::Flag () const
|
|
{
|
|
return m_cFlag;
|
|
}
|
|
|
|
bool CISpellAffixRule::IsCrossProduct () const
|
|
{
|
|
return m_bCrossProduct;
|
|
}
|
|
|
|
bool CISpellAffixRule::IsPrefix () const
|
|
{
|
|
return m_eRule==RULE_PREFIXES;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
enum AffixFormat_e
|
|
{
|
|
AFFIX_FORMAT_ISPELL = 0,
|
|
AFFIX_FORMAT_MYSPELL = 1,
|
|
AFFIX_FORMAT_UNKNOWN
|
|
};
|
|
|
|
const char * AffixFormatName[] =
|
|
{
|
|
"ISpell",
|
|
"MySpell"
|
|
};
|
|
|
|
class CISpellAffix
|
|
{
|
|
public:
|
|
CISpellAffix ( const char * szLocale, const char * szCharsetFile );
|
|
|
|
bool Load ( const char * szFilename );
|
|
CISpellAffixRule * GetRule ( int iRule );
|
|
int GetNumRules () const;
|
|
bool CheckCrosses () const;
|
|
|
|
private:
|
|
CSphVector < CISpellAffixRule > m_dRules;
|
|
|
|
char m_dCharset [256];
|
|
bool m_bFirstCaseConv;
|
|
CSphString m_sLocale;
|
|
CSphString m_sCharsetFile;
|
|
bool m_bCheckCrosses;
|
|
CSphLowercaser m_LowerCaser;
|
|
bool m_bUseLowerCaser;
|
|
bool m_bUseDictConversion;
|
|
|
|
bool AddToCharset ( char * szRangeL, char * szRangeU );
|
|
void AddCharPair ( BYTE uCharL, BYTE uCharU );
|
|
void Strip ( char * szText );
|
|
char ToLowerCase ( char cChar );
|
|
void LoadLocale ();
|
|
|
|
AffixFormat_e DetectFormat ( FILE * );
|
|
bool LoadISpell ( FILE * );
|
|
bool LoadMySpell ( FILE * );
|
|
};
|
|
|
|
|
|
CISpellAffix::CISpellAffix ( const char * szLocale, const char * szCharsetFile )
|
|
: m_bFirstCaseConv ( true )
|
|
, m_sLocale ( szLocale )
|
|
, m_sCharsetFile ( szCharsetFile )
|
|
, m_bCheckCrosses ( false )
|
|
, m_bUseLowerCaser ( false )
|
|
, m_bUseDictConversion ( false )
|
|
{
|
|
}
|
|
|
|
|
|
AffixFormat_e CISpellAffix::DetectFormat ( FILE * pFile )
|
|
{
|
|
char sBuffer [MAX_STR_LENGTH];
|
|
|
|
while ( !feof ( pFile ) )
|
|
{
|
|
char * sLine = fgets ( sBuffer, MAX_STR_LENGTH, pFile );
|
|
if ( !sLine )
|
|
break;
|
|
|
|
if ( !strncmp ( sLine, "SFX", 3 ) ) return AFFIX_FORMAT_MYSPELL;
|
|
if ( !strncmp ( sLine, "PFX", 3 ) ) return AFFIX_FORMAT_MYSPELL;
|
|
if ( !strncmp ( sLine, "REP", 3 ) ) return AFFIX_FORMAT_MYSPELL;
|
|
|
|
if ( !strncasecmp ( sLine, "prefixes", 8 ) ) return AFFIX_FORMAT_ISPELL;
|
|
if ( !strncasecmp ( sLine, "suffixes", 8 ) ) return AFFIX_FORMAT_ISPELL;
|
|
if ( !strncasecmp ( sLine, "flag", 4 ) ) return AFFIX_FORMAT_ISPELL;
|
|
}
|
|
|
|
return AFFIX_FORMAT_UNKNOWN;
|
|
}
|
|
|
|
|
|
bool CISpellAffix::Load ( const char * szFilename )
|
|
{
|
|
if ( !szFilename )
|
|
return false;
|
|
|
|
m_dRules.Reset ();
|
|
memset ( m_dCharset, 0, sizeof ( m_dCharset ) );
|
|
m_bFirstCaseConv = true;
|
|
m_bUseLowerCaser = false;
|
|
m_bUseDictConversion = false;
|
|
m_LowerCaser.Reset ();
|
|
|
|
FILE * pFile = fopen ( szFilename, "rt" );
|
|
if ( !pFile )
|
|
return false;
|
|
|
|
bool bResult = false;
|
|
AffixFormat_e eFormat = DetectFormat ( pFile );
|
|
if ( eFormat==AFFIX_FORMAT_UNKNOWN )
|
|
printf ( "Failed to detect affix file format\n" );
|
|
else
|
|
{
|
|
fseek ( pFile, SEEK_SET, 0 );
|
|
printf ( "Using %s affix file format\n", AffixFormatName[eFormat] );
|
|
switch ( eFormat )
|
|
{
|
|
case AFFIX_FORMAT_MYSPELL: bResult = LoadMySpell ( pFile ); break;
|
|
case AFFIX_FORMAT_ISPELL: bResult = LoadISpell ( pFile ); break;
|
|
case AFFIX_FORMAT_UNKNOWN: break;
|
|
}
|
|
}
|
|
fclose ( pFile );
|
|
|
|
bool bHaveCrossPrefix = false;
|
|
for ( int i = 0; i < m_dRules.GetLength () && !bHaveCrossPrefix; i++ )
|
|
if ( m_dRules[i].IsPrefix() && m_dRules[i].IsCrossProduct() )
|
|
bHaveCrossPrefix = true;
|
|
|
|
bool bHaveCrossSuffix = false;
|
|
for ( int i = 0; i < m_dRules.GetLength () && !bHaveCrossSuffix; i++ )
|
|
if ( !m_dRules[i].IsPrefix() && m_dRules[i].IsCrossProduct() )
|
|
bHaveCrossSuffix = true;
|
|
|
|
m_bCheckCrosses = bHaveCrossPrefix && bHaveCrossSuffix;
|
|
|
|
return bResult;
|
|
}
|
|
|
|
|
|
bool CISpellAffix::LoadISpell ( FILE * pFile )
|
|
{
|
|
char szBuffer [ MAX_STR_LENGTH ];
|
|
char szCondition [ MAX_STR_LENGTH ];
|
|
char szStrip [ MAX_STR_LENGTH ];
|
|
char szAppend [ MAX_STR_LENGTH ];
|
|
|
|
RuleType_e eRule = RULE_NONE;
|
|
char cFlag = '\0';
|
|
bool bCrossProduct = false;
|
|
int iLine = 0;
|
|
|
|
// TODO: parse all .aff character replacement commands
|
|
while ( !feof ( pFile ) )
|
|
{
|
|
char * szResult = fgets ( szBuffer, MAX_STR_LENGTH, pFile );
|
|
if ( !szResult )
|
|
break;
|
|
iLine++;
|
|
|
|
if ( !strncasecmp ( szBuffer, "prefixes", 8 ) )
|
|
{
|
|
eRule = RULE_PREFIXES;
|
|
continue;
|
|
}
|
|
|
|
if ( !strncasecmp ( szBuffer, "suffixes", 8 ) )
|
|
{
|
|
eRule = RULE_SUFFIXES;
|
|
continue;
|
|
}
|
|
|
|
if ( !strncasecmp ( szBuffer, "wordchars", 9 ) )
|
|
{
|
|
char * szStart = szBuffer + 9;
|
|
while ( *szStart && isspace ( (unsigned char) *szStart ) )
|
|
++szStart;
|
|
|
|
char * szRangeL = szStart;
|
|
while ( *szStart && !isspace ( (unsigned char) *szStart ) )
|
|
++szStart;
|
|
|
|
if ( !*szStart )
|
|
{
|
|
printf ( "WARNING: Line %d: invalid 'wordchars' statement\n", iLine );
|
|
continue;
|
|
}
|
|
|
|
*szStart = '\0';
|
|
++szStart;
|
|
|
|
while ( *szStart && isspace ( (unsigned char) *szStart ) )
|
|
++szStart;
|
|
|
|
char * szRangeU = szStart;
|
|
|
|
while ( *szStart && !isspace ( (unsigned char) *szStart ) )
|
|
++szStart;
|
|
|
|
*szStart = '\0';
|
|
|
|
if ( !AddToCharset ( szRangeL, szRangeU ) )
|
|
printf ( "WARNING: Line %d: cannot add to charset: '%s' '%s'\n", iLine, szRangeL, szRangeU );
|
|
|
|
continue;
|
|
}
|
|
|
|
if ( !strncasecmp ( szBuffer, "flag", 4 ) )
|
|
{
|
|
if ( eRule==RULE_NONE )
|
|
{
|
|
printf ( "WARNING: Line %d: 'flag' appears before preffixes or suffixes\n", iLine );
|
|
continue;
|
|
}
|
|
|
|
char * szStart = szBuffer + 4;
|
|
while ( *szStart && isspace ( (unsigned char) *szStart ) )
|
|
++szStart;
|
|
|
|
bCrossProduct = ( *szStart=='*' );
|
|
|
|
cFlag = bCrossProduct ? *(szStart + 1) : *(szStart);
|
|
continue;
|
|
}
|
|
|
|
if ( eRule==RULE_NONE )
|
|
continue;
|
|
|
|
char * szComment = strchr ( szBuffer, '#' );
|
|
if ( szComment )
|
|
*szComment = '\0';
|
|
|
|
if ( !* szBuffer )
|
|
continue;
|
|
|
|
szCondition[0] = '\0';
|
|
szStrip[0] = '\0';
|
|
szAppend[0] = '\0';
|
|
|
|
int nFields = sscanf ( szBuffer, "%[^>\n]>%[^,\n],%[^\n]", szCondition, szStrip, szAppend ); // NOLINT
|
|
|
|
Strip ( szCondition );
|
|
Strip ( szStrip );
|
|
Strip ( szAppend );
|
|
|
|
switch ( nFields )
|
|
{
|
|
case 2: // no optional strip-string
|
|
strcpy ( szAppend, szStrip ); // NOLINT
|
|
szStrip[0] = '\0';
|
|
break;
|
|
case 3: // all read
|
|
break;
|
|
default: // invalid repl
|
|
continue;
|
|
}
|
|
|
|
CISpellAffixRule Rule ( eRule, cFlag, bCrossProduct, szCondition, szStrip, szAppend );
|
|
m_dRules.Add ( Rule );
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
bool CISpellAffix::LoadMySpell ( FILE * pFile )
|
|
{
|
|
char sBuffer [MAX_STR_LENGTH];
|
|
char sCondition [MAX_STR_LENGTH];
|
|
char sRemove [MAX_STR_LENGTH];
|
|
char sAppend [MAX_STR_LENGTH];
|
|
|
|
RuleType_e eRule = RULE_NONE;
|
|
BYTE cFlag = 0;
|
|
BYTE cCombine = 0;
|
|
int iCount = 0, iLine = 0;
|
|
const char * sMode = 0;
|
|
|
|
while ( !feof ( pFile ) )
|
|
{
|
|
char * sLine = fgets ( sBuffer, MAX_STR_LENGTH, pFile );
|
|
if ( !sLine )
|
|
break;
|
|
++iLine;
|
|
|
|
// prefix and suffix rules
|
|
RuleType_e eNewRule = RULE_NONE;
|
|
if ( !strncmp ( sLine, "PFX", 3 ) )
|
|
{
|
|
eNewRule = RULE_PREFIXES;
|
|
sMode = "prefix";
|
|
|
|
} else if ( !strncmp ( sLine, "SFX", 3 ) )
|
|
{
|
|
eNewRule = RULE_SUFFIXES;
|
|
sMode = "suffix";
|
|
}
|
|
|
|
if ( eNewRule!=RULE_NONE )
|
|
{
|
|
sLine += 3;
|
|
while ( *sLine && isspace ( (unsigned char) *sLine ) )
|
|
++sLine;
|
|
|
|
if ( eNewRule!=eRule ) // new rule header
|
|
{
|
|
if ( iCount )
|
|
printf ( "WARNING: Line %d: Premature end of entries.\n", iLine );
|
|
|
|
if ( sscanf ( sLine, "%c %c %d", &cFlag, &cCombine, &iCount )!=3 ) // NOLINT
|
|
printf ( "WARNING; Line %d: Malformed %s header\n", iLine, sMode );
|
|
|
|
eRule = eNewRule;
|
|
|
|
} else // current rule continued
|
|
{
|
|
*sRemove = *sAppend = 0;
|
|
char cNewFlag;
|
|
if ( sscanf ( sLine, "%c %s %s %s", &cNewFlag, sRemove, sAppend, sCondition )==4 ) // NOLINT
|
|
{
|
|
if ( cNewFlag!=cFlag )
|
|
printf ( "WARNING: Line %d: Flag character mismatch\n", iLine );
|
|
|
|
if ( *sRemove=='0' && *(sRemove + 1)==0 ) *sRemove = 0;
|
|
if ( *sAppend=='0' && *(sAppend + 1)==0 ) *sAppend = 0;
|
|
|
|
CISpellAffixRule Rule ( eRule, cFlag, cCombine=='Y', sCondition, sRemove, sAppend );
|
|
m_dRules.Add ( Rule );
|
|
|
|
} else
|
|
printf ( "WARNING: Line %d: Malformed %s rule\n", iLine, sMode );
|
|
|
|
if ( !--iCount ) eRule = RULE_NONE;
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
CISpellAffixRule * CISpellAffix::GetRule ( int iRule )
|
|
{
|
|
return &m_dRules [iRule];
|
|
}
|
|
|
|
|
|
int CISpellAffix::GetNumRules () const
|
|
{
|
|
return m_dRules.GetLength ();
|
|
}
|
|
|
|
|
|
bool CISpellAffix::CheckCrosses () const
|
|
{
|
|
return m_bCheckCrosses;
|
|
}
|
|
|
|
|
|
bool CISpellAffix::AddToCharset ( char * szRangeL, char * szRangeU )
|
|
{
|
|
if ( !szRangeL || !szRangeU )
|
|
return false;
|
|
|
|
int iLengthL = strlen ( szRangeL );
|
|
int iLengthU = strlen ( szRangeU );
|
|
|
|
bool bSetL = ( iLengthL>0 && szRangeL[0]=='[' && szRangeL[iLengthL-1]==']' );
|
|
bool bSetR = ( iLengthU>0 && szRangeU[0]=='[' && szRangeU[iLengthU-1]==']' );
|
|
|
|
if ( bSetL!=bSetR )
|
|
return false;
|
|
|
|
if ( bSetL )
|
|
{
|
|
szRangeL [iLengthL - 1] = '\0';
|
|
szRangeL = szRangeL + 1;
|
|
|
|
szRangeU [iLengthU - 1] = '\0';
|
|
szRangeU = szRangeU + 1;
|
|
|
|
BYTE uMinL, uMaxL;
|
|
if ( !GetSetMinMax ( szRangeL, uMinL, uMaxL ) )
|
|
return false;
|
|
|
|
BYTE uMinU, uMaxU;
|
|
if ( !GetSetMinMax ( szRangeU, uMinU, uMaxU ) )
|
|
return false;
|
|
|
|
if ( ( uMaxU - uMinU )!=( uMaxL - uMinL ) )
|
|
return false;
|
|
|
|
for ( BYTE i=0; i<=( uMaxL - uMinL ); ++i )
|
|
if ( IsInSet ( uMinL + i, szRangeL ) && IsInSet ( uMinU + i, szRangeU ) )
|
|
AddCharPair ( uMinL + i, uMinU + i );
|
|
} else
|
|
{
|
|
if ( iLengthL > 4 || iLengthU > 4 )
|
|
return false;
|
|
|
|
const char * szL = szRangeL;
|
|
const char * szU = szRangeU;
|
|
AddCharPair ( GetWordchar(szL), GetWordchar(szU) );
|
|
}
|
|
|
|
m_bUseDictConversion = true;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
void CISpellAffix::AddCharPair ( BYTE uCharL, BYTE uCharU )
|
|
{
|
|
m_dCharset [uCharU] = uCharL;
|
|
}
|
|
|
|
|
|
void CISpellAffix::Strip ( char * szText )
|
|
{
|
|
char * szIterator1 = szText;
|
|
char * szIterator2 = szText;
|
|
while ( *szIterator1 )
|
|
{
|
|
if ( !isspace ( (unsigned char) *szIterator1 ) && *szIterator1!='-' )
|
|
{
|
|
*szIterator2 = *szIterator1;
|
|
++szIterator2;
|
|
}
|
|
++szIterator1;
|
|
}
|
|
|
|
*szIterator2 = '\0';
|
|
|
|
while ( *szText )
|
|
{
|
|
*szText = ToLowerCase ( *szText );
|
|
++szText;
|
|
}
|
|
}
|
|
|
|
|
|
char CISpellAffix::ToLowerCase ( char cChar )
|
|
{
|
|
if ( m_bFirstCaseConv )
|
|
{
|
|
LoadLocale ();
|
|
m_bFirstCaseConv = false;
|
|
}
|
|
|
|
// dictionary conversion
|
|
if ( m_bUseDictConversion )
|
|
return m_dCharset [(BYTE) cChar] ? m_dCharset [(BYTE) cChar] : cChar;
|
|
|
|
// user-defined character mapping
|
|
if ( m_bUseLowerCaser )
|
|
{
|
|
char cResult = (char)m_LowerCaser.ToLower ( (BYTE) cChar );
|
|
return cResult ? cResult : cChar;
|
|
}
|
|
|
|
// user-specified code page conversion
|
|
return (char)tolower ( (BYTE)cChar ); // workaround for systems (eg. FreeBSD) which default to signed char. marvelous!
|
|
}
|
|
|
|
|
|
void CISpellAffix::LoadLocale ()
|
|
{
|
|
if ( m_bUseDictConversion )
|
|
printf ( "Using dictionary-defined character set\n" );
|
|
else
|
|
if ( !m_sCharsetFile.IsEmpty () )
|
|
{
|
|
FILE * pFile = fopen ( m_sCharsetFile.cstr (), "rt" );
|
|
if ( pFile )
|
|
{
|
|
printf ( "Using charater set from '%s'\n", m_sCharsetFile.cstr () );
|
|
|
|
const int MAX_CHARSET_LENGTH = 4096;
|
|
char szBuffer [MAX_CHARSET_LENGTH];
|
|
|
|
char * szResult = fgets ( szBuffer, MAX_CHARSET_LENGTH, pFile );
|
|
if ( szResult )
|
|
{
|
|
CSphVector<CSphRemapRange> dRemaps;
|
|
if ( sphParseCharset ( szBuffer, dRemaps ) )
|
|
{
|
|
m_bUseLowerCaser = true;
|
|
m_LowerCaser.AddRemaps ( dRemaps, 0 );
|
|
} else
|
|
{
|
|
printf ( "Failed to parse charset from '%s'\n", m_sCharsetFile.cstr() );
|
|
}
|
|
} else
|
|
{
|
|
printf ( "Failed to read charset from '%s'\n", m_sCharsetFile.cstr() );
|
|
}
|
|
|
|
fclose ( pFile );
|
|
|
|
} else
|
|
{
|
|
printf ( "Failed to open '%s'\n", m_sCharsetFile.cstr() );
|
|
}
|
|
|
|
} else
|
|
{
|
|
if ( !m_sLocale.IsEmpty () )
|
|
{
|
|
char dLocaleC[256], dLocaleUser[256];
|
|
|
|
setlocale ( LC_ALL, "C" );
|
|
for ( int i=0; i<256; i++ )
|
|
dLocaleC[i] = (char) tolower(i);
|
|
|
|
char * szLocale = setlocale ( LC_CTYPE, m_sLocale.cstr() );
|
|
if ( szLocale )
|
|
{
|
|
printf ( "Using user-defined locale (locale=%s)\n", m_sLocale.cstr() );
|
|
|
|
for ( int i=0; i<256; i++ )
|
|
dLocaleUser[i] = (char) tolower(i);
|
|
|
|
if ( !memcmp ( dLocaleC, dLocaleUser, 256 ) )
|
|
printf ( "WARNING: user-defined locale provides the same case conversion as the default \"C\" locale\n" );
|
|
} else
|
|
printf ( "WARNING: could not set user-defined locale for case conversions (locale=%s)\n", m_sLocale.cstr() );
|
|
} else
|
|
printf ( "WARNING: no character set specified\n" );
|
|
}
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
enum OutputMode_e
|
|
{
|
|
M_DEBUG,
|
|
M_DUPLICATES,
|
|
M_LAST,
|
|
M_EXACT_OR_LONGEST,
|
|
M_DEFAULT = M_EXACT_OR_LONGEST
|
|
};
|
|
|
|
const char * dModeName[] =
|
|
{
|
|
"debug",
|
|
"duplicates",
|
|
"last"
|
|
};
|
|
|
|
struct MapInfo_t
|
|
{
|
|
CSphString m_sWord;
|
|
char m_sRules[3];
|
|
};
|
|
|
|
struct WordLess
|
|
{
|
|
inline bool IsLess ( const char * a, const char * b ) const
|
|
{
|
|
return strcoll ( a, b ) < 0;
|
|
}
|
|
};
|
|
|
|
typedef CSphOrderedHash < CSphVector<MapInfo_t>, CSphString, CSphStrHashFunc, 100000, 13 > WordMap_t;
|
|
|
|
static void EmitResult ( WordMap_t & tMap , const CSphString & sFrom, const CSphString & sTo, char cRuleA = 0, char cRuleB = 0 )
|
|
{
|
|
if ( !tMap.Exists(sFrom) )
|
|
tMap.Add ( CSphVector<MapInfo_t>(), sFrom );
|
|
|
|
MapInfo_t tInfo;
|
|
tInfo.m_sWord = sTo;
|
|
tInfo.m_sRules[0] = cRuleA;
|
|
tInfo.m_sRules[1] = cRuleB;
|
|
tInfo.m_sRules[2] = 0;
|
|
tMap[sFrom].Add ( tInfo );
|
|
}
|
|
|
|
int main ( int iArgs, char ** dArgs )
|
|
{
|
|
OutputMode_e eMode = M_DEFAULT;
|
|
bool bUseCustomCharset = false;
|
|
CSphString sDict, sAffix, sLocale, sCharsetFile, sResult = "result.txt";
|
|
|
|
printf ( "spelldump, an ispell dictionary dumper\n\n" );
|
|
|
|
int i = 1;
|
|
for ( ; i < iArgs; i++ )
|
|
{
|
|
if ( !strcmp ( dArgs[i], "-c" ) )
|
|
{
|
|
if ( ++i==iArgs ) break;
|
|
bUseCustomCharset = true;
|
|
sCharsetFile = dArgs[i];
|
|
|
|
} else if ( !strcmp ( dArgs[i], "-m" ) )
|
|
{
|
|
if ( ++i==iArgs ) break;
|
|
char * sMode = dArgs[i];
|
|
|
|
if ( !strcmp ( sMode, "debug" ) ) { eMode = M_DEBUG; continue; }
|
|
if ( !strcmp ( sMode, "duplicates" ) ) { eMode = M_DUPLICATES; continue; }
|
|
if ( !strcmp ( sMode, "last" ) ) { eMode = M_LAST; continue; }
|
|
if ( !strcmp ( sMode, "default" ) ) { eMode = M_DEFAULT; continue; }
|
|
|
|
printf ( "Unrecognized mode: %s\n", sMode );
|
|
return 1;
|
|
|
|
} else
|
|
break;
|
|
}
|
|
|
|
switch ( iArgs - i )
|
|
{
|
|
case 4:
|
|
sLocale = dArgs[i + 3];
|
|
case 3:
|
|
sResult = dArgs[i + 2];
|
|
case 2:
|
|
sAffix = dArgs[i + 1];
|
|
sDict = dArgs[i];
|
|
break;
|
|
default:
|
|
printf ( "Usage: spelldump [options] <dictionary> <affix> [result] [locale-name]\n\n"
|
|
"Options:\n"
|
|
"-c <file>\tuse case convertion defined in <file>\n"
|
|
"-m <mode>\toutput (conflict resolution) mode:\n"
|
|
"\t\tdefault - try to guess the best way to resolve a conflict\n"
|
|
"\t\tlast - choose last entry\n"
|
|
"\t\tdebug - dump all mappings (with rules)\n"
|
|
"\t\tduplicates - dump duplicate mappings only (with rules)\n" );
|
|
if ( iArgs>1 )
|
|
{
|
|
printf ( "\n"
|
|
"Examples:\n"
|
|
"spelldump en.dict en.aff\n"
|
|
"spelldump ru.dict ru.aff ru.txt ru_RU.CP1251\n"
|
|
"spelldump ru.dict ru.aff ru.txt .1251\n" );
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
printf ( "Loading dictionary...\n" );
|
|
CISpellDict Dict;
|
|
if ( !Dict.Load ( sDict.cstr () ) )
|
|
sphDie ( "Error loading dictionary file '%s'\n", sDict.IsEmpty () ? "" : sDict.cstr () );
|
|
|
|
printf ( "Loading affix file...\n" );
|
|
CISpellAffix Affix ( sLocale.cstr (), bUseCustomCharset ? sCharsetFile.cstr () : NULL );
|
|
if ( !Affix.Load ( sAffix.cstr () ) )
|
|
sphDie ( "Error loading affix file '%s'\n", sAffix.IsEmpty () ? "" : sAffix.cstr () );
|
|
|
|
if ( sResult.IsEmpty () )
|
|
sphDie ( "No result file specified\n" );
|
|
|
|
FILE * pFile = fopen ( sResult.cstr (), "wt" );
|
|
if ( !pFile )
|
|
sphDie ( "Unable to open '%s' for writing\n", sResult.cstr () );
|
|
|
|
if ( eMode!=M_DEFAULT )
|
|
printf ( "Output mode: %s\n", dModeName[eMode] );
|
|
|
|
Dict.IterateStart ();
|
|
WordMap_t tWordMap;
|
|
const CISpellDict::CISpellDictWord * pWord = NULL;
|
|
int nDone = 0;
|
|
while ( ( pWord = Dict.IterateNext () )!=NULL )
|
|
{
|
|
EmitResult ( tWordMap, pWord->m_sWord, pWord->m_sWord );
|
|
|
|
if ( ( ++nDone % 10 )==0 )
|
|
{
|
|
printf ( "\rDictionary words processed: %d", nDone );
|
|
fflush ( stdout );
|
|
}
|
|
|
|
if ( pWord->m_sFlags.IsEmpty() )
|
|
continue;
|
|
|
|
CSphString sWord, sWordForCross;
|
|
int iFlagLen = strlen ( pWord->m_sFlags.cstr () );
|
|
for ( int iFlag1 = 0; iFlag1 < iFlagLen; ++iFlag1 )
|
|
for ( int iRule1 = 0; iRule1 < Affix.GetNumRules (); ++iRule1 )
|
|
{
|
|
CISpellAffixRule * pRule1 = Affix.GetRule ( iRule1 );
|
|
if ( pRule1->Flag()!=pWord->m_sFlags.cstr()[iFlag1] )
|
|
continue;
|
|
|
|
sWord = pWord->m_sWord;
|
|
|
|
if ( !pRule1->Apply ( sWord ) )
|
|
continue;
|
|
|
|
EmitResult ( tWordMap, sWord, pWord->m_sWord, pRule1->Flag() );
|
|
|
|
// apply other rules
|
|
if ( !Affix.CheckCrosses() )
|
|
continue;
|
|
|
|
if ( !pRule1->IsCrossProduct() )
|
|
continue;
|
|
|
|
for ( int iFlag2 = iFlag1 + 1; iFlag2 < iFlagLen; ++iFlag2 )
|
|
for ( int iRule2 = 0; iRule2 < Affix.GetNumRules (); ++iRule2 )
|
|
{
|
|
CISpellAffixRule * pRule2 = Affix.GetRule ( iRule2 );
|
|
if ( !pRule2->IsCrossProduct () || pRule2->Flag()!=pWord->m_sFlags.cstr()[iFlag2] ||
|
|
pRule2->IsPrefix()==pRule1->IsPrefix() )
|
|
continue;
|
|
|
|
sWordForCross = sWord;
|
|
if ( pRule2->Apply ( sWordForCross ) )
|
|
EmitResult ( tWordMap, sWordForCross, pWord->m_sWord, pRule1->Flag(), pRule2->Flag() );
|
|
}
|
|
}
|
|
}
|
|
printf ( "\rDictionary words processed: %d\n", nDone );
|
|
|
|
// output
|
|
|
|
CSphVector<const char *> dKeys;
|
|
tWordMap.IterateStart();
|
|
while ( tWordMap.IterateNext() )
|
|
dKeys.Add ( tWordMap.IterateGetKey().cstr() );
|
|
dKeys.Sort ( WordLess() );
|
|
|
|
ARRAY_FOREACH ( iKey, dKeys )
|
|
{
|
|
const CSphVector<MapInfo_t> & dWords = tWordMap[dKeys[iKey]];
|
|
const char * sKey = dKeys[iKey];
|
|
|
|
switch ( eMode )
|
|
{
|
|
case M_LAST:
|
|
fprintf ( pFile, "%s > %s\n", sKey, dWords.Last().m_sWord.cstr() );
|
|
break;
|
|
|
|
case M_EXACT_OR_LONGEST:
|
|
{
|
|
int iMatch = 0;
|
|
int iLength = 0;
|
|
|
|
ARRAY_FOREACH ( i, dWords )
|
|
{
|
|
if ( dWords[i].m_sWord==sKey )
|
|
{
|
|
iMatch = i;
|
|
break;
|
|
}
|
|
|
|
int iWordLength = strlen ( dWords[i].m_sWord.cstr() );
|
|
if ( iWordLength>iLength )
|
|
{
|
|
iLength = iWordLength;
|
|
iMatch = i;
|
|
}
|
|
}
|
|
|
|
fprintf ( pFile, "%s > %s\n", sKey, dWords[iMatch].m_sWord.cstr() );
|
|
break;
|
|
}
|
|
|
|
case M_DUPLICATES:
|
|
if ( dWords.GetLength()==1 ) break;
|
|
case M_DEBUG:
|
|
ARRAY_FOREACH ( i, dWords )
|
|
fprintf ( pFile, "%s > %s %s/%d\n", sKey, dWords[i].m_sWord.cstr(),
|
|
dWords[i].m_sRules, dWords.GetLength() );
|
|
break;
|
|
}
|
|
}
|
|
|
|
fclose ( pFile );
|
|
|
|
return 0;
|
|
}
|
|
|
|
//
|
|
// $Id$
|
|
//
|