2016-12-17 19:39:01 +08:00

1196 lines
26 KiB
C++
Executable File

//
// $Id$
//
//
// Copyright (c) 2001-2011, Andrew Aksyonoff
// Copyright (c) 2008-2011, Sphinx Technologies Inc
// All rights reserved
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License. You should have
// received a copy of the GPL license along with this program; if you
// did not, you can find it at http://www.gnu.org/
//
#include "sphinx.h"
#include "sphinxutils.h"
#include <locale.h>
// Size, in bytes, of the fixed line and scratch buffers used throughout this file
// (dictionary lines, affix lines, and the temporary word built when applying a rule).
const int MAX_STR_LENGTH = 512;
//////////////////////////////////////////////////////////////////////////
// Read one character code from a set specification and advance the cursor.
//
// A plain character is returned as-is and the cursor moves by one (this includes
// the terminating zero, which is returned as 0). A backslash starts a four-character
// escape: backslash, any one character, and two hex digits (e.g. \x41); the decoded
// byte is returned and the cursor moves by four.
//
// Returns 0 without moving the cursor when an escape is truncated or its two
// trailing characters are not valid hex digits.
BYTE GetWordchar ( const char * & szSet )
{
	// the common case: an ordinary character
	if ( *szSet!='\\' )
		return *szSet++;

	// an escape needs three more characters after the backslash
	if ( !szSet[1] || !szSet[2] || !szSet[3] )
		return 0;

	// only the last two characters of the escape carry the value
	char sHex[3] = { szSet[2], szSet[3], '\0' };

	char * pEnd = NULL;
	const long iCode = strtol ( sHex, &pEnd, 16 );
	const bool bAllConsumed = ( pEnd==sHex+2 );
	if ( !bAllConsumed || iCode<0 || iCode>255 )
		return 0;

	szSet += 4;
	return (BYTE) iCode;
}
// Test whether a character belongs to a set specification.
//
// szSet is the text found between '[' and ']' in a condition. A leading '^'
// inverts the result. If the specification contains a '-', it is treated as a
// single range: the first character of the set and the character right after
// the '-' are the bounds (in either order). Otherwise it is a plain list of
// characters, each read with GetWordchar().
//
// Returns false for a NULL set.
bool IsInSet ( BYTE uLetter, const char * szSet )
{
	if ( !szSet )
		return false;

	const bool bInvert = ( *szSet=='^' );
	if ( bInvert )
		++szSet;

	const char * szSep = strchr ( szSet, '-' );
	if ( szSep )
	{
		// range form: bounds are the first character and the one after the dash
		BYTE uRange1 = GetWordchar ( szSet );
		szSep++;
		BYTE uRange2 = GetWordchar ( szSep );

		const bool bInRange = ( uLetter>=Min ( uRange1, uRange2 ) && uLetter<=Max ( uRange1, uRange2 ) );

		// inversion must apply to misses as well as hits; previously a letter
		// outside an inverted range was reported as "not in set"
		return bInRange!=bInvert;
	}

	// list form: scan until the letter is found or the list ends
	BYTE uChar = 0;
	while ( ( uChar = GetWordchar ( szSet ) )!=0 )
		if ( uChar==uLetter )
			break;

	const bool bFound = ( uChar!=0 );
	return bFound!=bInvert;
}
bool GetSetMinMax ( const char * szSet, BYTE & uMin, BYTE & uMax )
{
if ( !szSet || !*szSet )
return false;
uMin = GetWordchar ( szSet );
uMax = uMin;
BYTE uChar;
while ( ( uChar = GetWordchar ( szSet ) )!=0 )
if ( uChar!='-' )
{
uMin = Min ( uMin, uChar );
uMax = Max ( uMax, uChar );
}
if ( !uMin || !uMax )
return false;
return true;
}
//////////////////////////////////////////////////////////////////////////
// In-memory copy of a dictionary file: a list of words, each with its affix flags.
class CISpellDict
{
public:
	// One dictionary line: the word and the flag characters that followed the '/'
	// (empty when the line had no '/').
	struct CISpellDictWord
	{
		CSphString m_sWord;
		CSphString m_sFlags;
	};

	// Start with a defined iterator position, so that calling IterateNext()
	// before IterateStart() does not read an indeterminate index.
	CISpellDict ()
		: m_iIterator ( 0 )
	{}

	// Replace the current contents with the entries read from szFilename.
	// Returns false if the name is NULL or the file cannot be opened.
	bool Load ( const char * szFilename );

	// Rewind the iterator to the first entry.
	void IterateStart ();

	// Return the current entry and advance, or NULL when all entries were visited.
	const CISpellDictWord * IterateNext ();

private:
	CSphVector < CISpellDictWord > m_dEntries;	// loaded entries, in file order
	int m_iIterator;							// index of the entry IterateNext() returns next
};
bool CISpellDict::Load ( const char * szFilename )
{
if ( !szFilename )
return false;
m_dEntries.Reset ();
m_dEntries.Reserve ( 131072 );
FILE * pFile = fopen ( szFilename, "rt" );
if ( !pFile )
return false;
char szWordBuffer [MAX_STR_LENGTH];
while ( !feof ( pFile ) )
{
char * szResult = fgets ( szWordBuffer, MAX_STR_LENGTH, pFile );
if ( !szResult )
break;
int iPos = strlen ( szWordBuffer ) - 1;
while ( iPos>=0 && isspace ( (unsigned char)szWordBuffer[iPos] ) )
szWordBuffer [iPos--] = '\0';
CISpellDictWord Word;
char * szPosition = strchr ( szWordBuffer, '/' );
if ( !szPosition )
{
szPosition = szWordBuffer;
while ( *szPosition && !isspace ( (unsigned char)*szPosition ) )
++szPosition;
*szPosition = '\0';
Word.m_sWord = szWordBuffer;
} else
{
*szPosition = '\0';
Word.m_sWord = szWordBuffer;
++szPosition;
char * szFlags = szPosition;
while ( *szPosition && !isspace ( (unsigned char)*szPosition ) )
++szPosition;
*szPosition = '\0';
Word.m_sFlags = szFlags;
}
m_dEntries.Add ( Word );
}
fclose ( pFile );
return true;
}
// Rewind the iterator so that the next IterateNext() call returns the first entry.
void CISpellDict::IterateStart ()
{
m_iIterator = 0;
}
// Return a pointer to the current entry and advance the iterator.
// Returns NULL, without advancing, once every entry has been visited.
const CISpellDict::CISpellDictWord * CISpellDict::IterateNext ()
{
	const bool bExhausted = ( m_iIterator>=m_dEntries.GetLength() );
	if ( bExhausted )
		return NULL;

	const CISpellDictWord * pEntry = &m_dEntries [m_iIterator];
	++m_iIterator;
	return pEntry;
}
//////////////////////////////////////////////////////////////////////////
// Kind of affix rule; RULE_NONE also means "no section/group is open" in the affix parsers.
enum RuleType_e
{
	RULE_NONE		= 0,
	RULE_PREFIXES	= 1,
	RULE_SUFFIXES	= 2
};
class CISpellAffixRule
{
public:
CISpellAffixRule () {}
CISpellAffixRule ( RuleType_e eRule, char cFlag, bool bCrossProduct, char * szCondition, char * szStrip, char * szAppend );
bool Apply ( CSphString & sWord );
char Flag () const;
bool IsCrossProduct () const;
bool IsPrefix () const;
private:
RuleType_e m_eRule;
char m_cFlag;
bool m_bCrossProduct;
CSphString m_sCondition;
CSphString m_sStrip;
CSphString m_sAppend;
int m_iWordLen;
int m_iCondLen;
int m_iStripLen;
int m_iAppendLen;
bool CheckSuffix ( const CSphString & sWord ) const;
bool CheckPrefix ( const CSphString & sWord ) const;
bool StripAppendSuffix ( CSphString & sWord ) const;
bool StripAppendPrefix ( CSphString & sWord ) const;
};
// Build a rule from its parsed parts. Any of the three strings may be NULL,
// in which case its cached length is 0.
CISpellAffixRule::CISpellAffixRule ( RuleType_e eRule, char cFlag, bool bCrossProduct, char * szCondition, char * szStrip, char * szAppend )
	: m_eRule ( eRule )
	, m_cFlag ( cFlag )
	, m_bCrossProduct ( bCrossProduct )
	, m_sCondition ( szCondition )
	, m_sStrip ( szStrip )
	, m_sAppend ( szAppend )
	, m_iWordLen ( 0 )
	, m_iCondLen ( szCondition ? (int) strlen ( szCondition ) : 0 )
	, m_iStripLen ( szStrip ? (int) strlen ( szStrip ) : 0 )
	, m_iAppendLen ( szAppend ? (int) strlen ( szAppend ) : 0 )
{
}
// Apply the rule to sWord, rewriting it in place on success.
//
// A rule with an empty condition reports success and leaves the word untouched.
// An empty word never matches. A condition of exactly "." skips the condition
// check. Any rule that is not a suffix rule is handled as a prefix rule.
//
// Returns true if the rule applied, false otherwise.
bool CISpellAffixRule::Apply ( CSphString & sWord )
{
	if ( m_sCondition.IsEmpty () )
		return true;

	if ( sWord.IsEmpty () )
		return false;

	// the helpers below rely on this cached length
	m_iWordLen = strlen ( sWord.cstr () );

	const bool bMatchesAnything = ( m_sCondition=="." );

	if ( m_eRule==RULE_SUFFIXES )
	{
		const bool bConditionOk = bMatchesAnything || CheckSuffix ( sWord );
		return bConditionOk && StripAppendSuffix ( sWord );
	}

	const bool bConditionOk = bMatchesAnything || CheckPrefix ( sWord );
	return bConditionOk && StripAppendPrefix ( sWord );
}
// Match the condition against the end of the word, walking both backwards.
//
// Each condition element is either a literal character or a bracketed set
// (evaluated with IsInSet()) and consumes exactly one word character.
// Returns false on the first mismatch, or on a ']' with no matching '['.
// If the word runs out before the condition does, the rest of the condition
// is not checked and the result is true.
bool CISpellAffixRule::CheckSuffix ( const CSphString & sWord ) const
{
	const char * sCond = m_sCondition.cstr();
	const char * sText = sWord.cstr();

	int iCond = m_iCondLen-1;
	int iText = m_iWordLen-1;

	while ( iCond>=0 && iText>=0 )
	{
		const char cLetter = sText[iText--];

		// literal element
		if ( sCond[iCond]!=']' )
		{
			if ( sCond[iCond]!=cLetter )
				return false;

			--iCond;
			continue;
		}

		// set element: walk back to the opening bracket
		int iOpen = iCond;
		while ( iOpen>=0 && sCond[iOpen]!='[' )
			--iOpen;

		if ( iOpen<0 )
			return false;

		if ( !IsInSet ( cLetter, m_sCondition.SubString ( iOpen + 1, iCond - iOpen - 1 ).cstr () ) )
			return false;

		// continue with whatever precedes the set
		iCond = iOpen - 1;
	}

	return true;
}
// Replace the strip-string at the end of the word with the append-string.
//
// Relies on m_iWordLen having been set by Apply(). Returns false, leaving sWord
// unchanged, if the word does not end with the strip-string or if the result
// would not fit into the MAX_STR_LENGTH scratch buffer.
bool CISpellAffixRule::StripAppendSuffix ( CSphString & sWord ) const
{
	// a word shorter than the string to strip can never match
	if ( m_iWordLen < m_iStripLen )
		return false;

	const int iStemLen = m_iWordLen - m_iStripLen;

	if ( !m_sStrip.IsEmpty () && strncmp ( sWord.cstr () + iStemLen, m_sStrip.cstr (), m_iStripLen ) )
		return false;

	// refuse to build a word that would overflow the buffer (terminator included)
	if ( iStemLen + m_iAppendLen >= MAX_STR_LENGTH )
		return false;

	char szTmp [MAX_STR_LENGTH];
	memcpy ( szTmp, sWord.cstr (), iStemLen );
	if ( !m_sAppend.IsEmpty () )
		memcpy ( szTmp + iStemLen, m_sAppend.cstr (), m_iAppendLen );
	szTmp [iStemLen + m_iAppendLen] = '\0';

	sWord = szTmp;
	return true;
}
// Match the condition against the start of the word, walking both forwards.
//
// Each condition element is either a literal character or a bracketed set
// (evaluated with IsInSet()) and consumes exactly one word character.
// Returns false on the first mismatch, or on a '[' with no matching ']'.
// If the word runs out before the condition does, the rest of the condition
// is not checked and the result is true.
bool CISpellAffixRule::CheckPrefix ( const CSphString & sWord ) const
{
	const char * sCond = m_sCondition.cstr();
	const char * sText = sWord.cstr();

	int iCondI = 0;
	for ( int i = 0; iCondI < m_iCondLen && i < m_iWordLen; ++i )
	{
		// literal element
		if ( sCond[iCondI]!='[' )
		{
			if ( sCond[iCondI]!=sText[i] )
				return false;

			++iCondI;
			continue;
		}

		// set element: look ahead for the closing bracket
		int iRangeEnd = -1;
		for ( int j=iCondI; j<m_iCondLen && iRangeEnd==-1; ++j )
			if ( sCond[j]==']' )
				iRangeEnd = j;

		if ( iRangeEnd==-1 )
			return false;

		if ( !IsInSet ( sText[i], m_sCondition.SubString ( iCondI + 1, iRangeEnd - iCondI - 1 ).cstr () ) )
			return false;

		// step over the whole set; without this the same set was re-tested
		// against every remaining character of the word
		iCondI = iRangeEnd + 1;
	}

	return true;
}
// Replace the strip-string at the start of the word with the append-string.
//
// Relies on m_iWordLen having been set by Apply(). Returns false, leaving sWord
// unchanged, if the word does not start with the strip-string or if the result
// would not fit into the MAX_STR_LENGTH scratch buffer.
bool CISpellAffixRule::StripAppendPrefix ( CSphString & sWord ) const
{
	// a word shorter than the string to strip can never match
	if ( m_iWordLen < m_iStripLen )
		return false;

	// plain prefix comparison; no need to search the whole word
	if ( !m_sStrip.IsEmpty () && strncmp ( sWord.cstr (), m_sStrip.cstr (), m_iStripLen ) )
		return false;

	const int iTailLen = m_iWordLen - m_iStripLen;

	// refuse to build a word that would overflow the buffer (terminator included)
	if ( m_iAppendLen + iTailLen >= MAX_STR_LENGTH )
		return false;

	char szTmp [MAX_STR_LENGTH];
	if ( !m_sAppend.IsEmpty () )
		memcpy ( szTmp, m_sAppend.cstr (), m_iAppendLen );
	memcpy ( szTmp + m_iAppendLen, sWord.cstr () + m_iStripLen, iTailLen );
	szTmp [m_iAppendLen + iTailLen] = '\0';

	sWord = szTmp;
	return true;
}
// Flag character this rule was declared under.
char CISpellAffixRule::Flag () const
{
return m_cFlag;
}
// Whether the rule was declared as cross-product (combinable with a rule of the other kind).
bool CISpellAffixRule::IsCrossProduct () const
{
return m_bCrossProduct;
}
// True for prefix rules; false for suffix rules and for RULE_NONE.
bool CISpellAffixRule::IsPrefix () const
{
return m_eRule==RULE_PREFIXES;
}
//////////////////////////////////////////////////////////////////////////
// Affix file flavours; the first two values double as indexes into AffixFormatName.
enum AffixFormat_e
{
	AFFIX_FORMAT_ISPELL		= 0,
	AFFIX_FORMAT_MYSPELL	= 1,
	AFFIX_FORMAT_UNKNOWN	= 2
};
// Printable names of the known affix formats, indexed by AffixFormat_e.
// There is no entry for AFFIX_FORMAT_UNKNOWN.
const char * AffixFormatName[] = { "ISpell", "MySpell" };
// Loader and container for the rules of an affix file (ISpell or MySpell flavour),
// plus the case conversion applied to rule strings while parsing ISpell files.
class CISpellAffix
{
public:
// szLocale and szCharsetFile are stored and only used later, by LoadLocale().
CISpellAffix ( const char * szLocale, const char * szCharsetFile );
// Detect the format of szFilename and load its rules; false on any failure.
bool Load ( const char * szFilename );
// Pointer to rule number iRule (0-based).
CISpellAffixRule * GetRule ( int iRule );
// Number of loaded rules.
int GetNumRules () const;
// True when the loaded file has both a cross-product prefix and a cross-product suffix rule.
bool CheckCrosses () const;
private:
// rules in the order they were read
CSphVector < CISpellAffixRule > m_dRules;
// lowercase mapping built from 'wordchars' statements: indexed by the uppercase
// character code, holds the lowercase one; 0 means "no mapping"
char m_dCharset [256];
// true until the first ToLowerCase() call, which triggers LoadLocale()
bool m_bFirstCaseConv;
CSphString m_sLocale;
CSphString m_sCharsetFile;
// computed at the end of Load(); returned by CheckCrosses()
bool m_bCheckCrosses;
// charset-file based conversion; used when m_bUseLowerCaser is set
CSphLowercaser m_LowerCaser;
// set by LoadLocale() once the charset file was parsed successfully
bool m_bUseLowerCaser;
// set by AddToCharset(); makes ToLowerCase() use m_dCharset
bool m_bUseDictConversion;
bool AddToCharset ( char * szRangeL, char * szRangeU );
void AddCharPair ( BYTE uCharL, BYTE uCharU );
void Strip ( char * szText );
char ToLowerCase ( char cChar );
void LoadLocale ();
AffixFormat_e DetectFormat ( FILE * );
bool LoadISpell ( FILE * );
bool LoadMySpell ( FILE * );
};
// Remember the locale name and charset file name for later use by LoadLocale().
// Either argument may be NULL.
CISpellAffix::CISpellAffix ( const char * szLocale, const char * szCharsetFile )
	: m_bFirstCaseConv ( true )
	, m_sLocale ( szLocale )
	, m_sCharsetFile ( szCharsetFile )
	, m_bCheckCrosses ( false )
	, m_bUseLowerCaser ( false )
	, m_bUseDictConversion ( false )
{
	// start with an empty mapping instead of leaving the table indeterminate until Load()
	memset ( m_dCharset, 0, sizeof ( m_dCharset ) );
}
// Guess the affix file format from the first line that starts with a known keyword.
//
// Lines starting with "SFX", "PFX" or "REP" (case-sensitive) mean MySpell; lines
// starting with "prefixes", "suffixes" or "flag" (case-insensitive) mean ISpell.
// Reads from the current file position and does not rewind.
// Returns AFFIX_FORMAT_UNKNOWN if no line matches.
AffixFormat_e CISpellAffix::DetectFormat ( FILE * pFile )
{
	static const char * const dMySpellTags[] = { "SFX", "PFX", "REP" };
	static const char * const dISpellTags[] = { "prefixes", "suffixes", "flag" };
	const int nMySpellTags = sizeof(dMySpellTags) / sizeof(dMySpellTags[0]);
	const int nISpellTags = sizeof(dISpellTags) / sizeof(dISpellTags[0]);

	char sBuffer [MAX_STR_LENGTH];
	while ( !feof ( pFile ) && fgets ( sBuffer, MAX_STR_LENGTH, pFile ) )
	{
		for ( int iTag=0; iTag<nMySpellTags; iTag++ )
			if ( !strncmp ( sBuffer, dMySpellTags[iTag], 3 ) )
				return AFFIX_FORMAT_MYSPELL;

		for ( int iTag=0; iTag<nISpellTags; iTag++ )
			if ( !strncasecmp ( sBuffer, dISpellTags[iTag], strlen ( dISpellTags[iTag] ) ) )
				return AFFIX_FORMAT_ISPELL;
	}

	return AFFIX_FORMAT_UNKNOWN;
}
// Load an affix file, replacing any previously loaded rules and charset state.
//
// The format is detected first, then the file is rewound and handed to the
// matching loader. Afterwards the cross-product check flag is recomputed: it is
// set only when at least one cross-product prefix rule and at least one
// cross-product suffix rule were loaded.
//
// Returns false if szFilename is NULL, the file cannot be opened, the format is
// not recognized, the file cannot be rewound, or the loader fails.
bool CISpellAffix::Load ( const char * szFilename )
{
	if ( !szFilename )
		return false;

	// forget everything from a previous load
	m_dRules.Reset ();
	memset ( m_dCharset, 0, sizeof ( m_dCharset ) );
	m_bFirstCaseConv = true;
	m_bUseLowerCaser = false;
	m_bUseDictConversion = false;
	m_LowerCaser.Reset ();

	FILE * pFile = fopen ( szFilename, "rt" );
	if ( !pFile )
		return false;

	bool bResult = false;
	AffixFormat_e eFormat = DetectFormat ( pFile );
	if ( eFormat==AFFIX_FORMAT_UNKNOWN )
	{
		printf ( "Failed to detect affix file format\n" );

	} else if ( fseek ( pFile, 0, SEEK_SET )!=0 ) // offset first, origin second
	{
		printf ( "Failed to rewind affix file\n" );

	} else
	{
		printf ( "Using %s affix file format\n", AffixFormatName[eFormat] );
		switch ( eFormat )
		{
			case AFFIX_FORMAT_MYSPELL:	bResult = LoadMySpell ( pFile ); break;
			case AFFIX_FORMAT_ISPELL:	bResult = LoadISpell ( pFile ); break;
			case AFFIX_FORMAT_UNKNOWN:	break;
		}
	}
	fclose ( pFile );

	// cross products only make sense if both kinds of cross-product rules exist
	bool bHaveCrossPrefix = false;
	bool bHaveCrossSuffix = false;
	for ( int i = 0; i < m_dRules.GetLength () && !( bHaveCrossPrefix && bHaveCrossSuffix ); i++ )
	{
		if ( !m_dRules[i].IsCrossProduct() )
			continue;

		if ( m_dRules[i].IsPrefix() )
			bHaveCrossPrefix = true;
		else
			bHaveCrossSuffix = true;
	}

	m_bCheckCrosses = bHaveCrossPrefix && bHaveCrossSuffix;
	return bResult;
}
// Parse an ISpell-flavoured affix file from the current position of pFile.
//
// Recognized statements (keywords are matched case-insensitively at line start):
//   prefixes / suffixes  - switch the kind of the rules that follow
//   wordchars L U        - register a lowercase/uppercase character pair or set pair
//   flag [*]X            - set the flag for the rules that follow; '*' marks cross-product
//   cond > [strip,]append - a rule; only parsed once a prefixes/suffixes section is open
// Malformed statements produce a warning or are skipped; the function always returns true.
bool CISpellAffix::LoadISpell ( FILE * pFile )
{
char szBuffer [ MAX_STR_LENGTH ];
char szCondition [ MAX_STR_LENGTH ];
char szStrip [ MAX_STR_LENGTH ];
char szAppend [ MAX_STR_LENGTH ];
// parser state carried from line to line
RuleType_e eRule = RULE_NONE;
char cFlag = '\0';
bool bCrossProduct = false;
int iLine = 0;
// TODO: parse all .aff character replacement commands
while ( !feof ( pFile ) )
{
char * szResult = fgets ( szBuffer, MAX_STR_LENGTH, pFile );
if ( !szResult )
break;
iLine++;
// section headers
if ( !strncasecmp ( szBuffer, "prefixes", 8 ) )
{
eRule = RULE_PREFIXES;
continue;
}
if ( !strncasecmp ( szBuffer, "suffixes", 8 ) )
{
eRule = RULE_SUFFIXES;
continue;
}
// wordchars: split the rest of the line into two whitespace-separated tokens
if ( !strncasecmp ( szBuffer, "wordchars", 9 ) )
{
char * szStart = szBuffer + 9;
while ( *szStart && isspace ( (unsigned char) *szStart ) )
++szStart;
char * szRangeL = szStart;
while ( *szStart && !isspace ( (unsigned char) *szStart ) )
++szStart;
// the buffer ended right after the first token
if ( !*szStart )
{
printf ( "WARNING: Line %d: invalid 'wordchars' statement\n", iLine );
continue;
}
*szStart = '\0';
++szStart;
while ( *szStart && isspace ( (unsigned char) *szStart ) )
++szStart;
char * szRangeU = szStart;
while ( *szStart && !isspace ( (unsigned char) *szStart ) )
++szStart;
*szStart = '\0';
if ( !AddToCharset ( szRangeL, szRangeU ) )
printf ( "WARNING: Line %d: cannot add to charset: '%s' '%s'\n", iLine, szRangeL, szRangeU );
continue;
}
// flag: the flag character, optionally preceded by '*' for cross-product
if ( !strncasecmp ( szBuffer, "flag", 4 ) )
{
if ( eRule==RULE_NONE )
{
printf ( "WARNING: Line %d: 'flag' appears before preffixes or suffixes\n", iLine );
continue;
}
char * szStart = szBuffer + 4;
while ( *szStart && isspace ( (unsigned char) *szStart ) )
++szStart;
bCrossProduct = ( *szStart=='*' );
cFlag = bCrossProduct ? *(szStart + 1) : *(szStart);
continue;
}
// anything else is only meaningful inside a prefixes/suffixes section
if ( eRule==RULE_NONE )
continue;
// drop the comment, if any
char * szComment = strchr ( szBuffer, '#' );
if ( szComment )
*szComment = '\0';
if ( !* szBuffer )
continue;
szCondition[0] = '\0';
szStrip[0] = '\0';
szAppend[0] = '\0';
// fields: text before '>', then text before ',', then the rest of the line
int nFields = sscanf ( szBuffer, "%[^>\n]>%[^,\n],%[^\n]", szCondition, szStrip, szAppend ); // NOLINT
// Strip() removes whitespace and '-' characters and lowercases what is left
Strip ( szCondition );
Strip ( szStrip );
Strip ( szAppend );
switch ( nFields )
{
case 2: // no optional strip-string
// what was read into the strip slot is really the append-string
strcpy ( szAppend, szStrip ); // NOLINT
szStrip[0] = '\0';
break;
case 3: // all read
break;
default: // invalid repl
continue;
}
CISpellAffixRule Rule ( eRule, cFlag, bCrossProduct, szCondition, szStrip, szAppend );
m_dRules.Add ( Rule );
}
return true;
}
bool CISpellAffix::LoadMySpell ( FILE * pFile )
{
char sBuffer [MAX_STR_LENGTH];
char sCondition [MAX_STR_LENGTH];
char sRemove [MAX_STR_LENGTH];
char sAppend [MAX_STR_LENGTH];
RuleType_e eRule = RULE_NONE;
BYTE cFlag = 0;
BYTE cCombine = 0;
int iCount = 0, iLine = 0;
const char * sMode = 0;
while ( !feof ( pFile ) )
{
char * sLine = fgets ( sBuffer, MAX_STR_LENGTH, pFile );
if ( !sLine )
break;
++iLine;
// prefix and suffix rules
RuleType_e eNewRule = RULE_NONE;
if ( !strncmp ( sLine, "PFX", 3 ) )
{
eNewRule = RULE_PREFIXES;
sMode = "prefix";
} else if ( !strncmp ( sLine, "SFX", 3 ) )
{
eNewRule = RULE_SUFFIXES;
sMode = "suffix";
}
if ( eNewRule!=RULE_NONE )
{
sLine += 3;
while ( *sLine && isspace ( (unsigned char) *sLine ) )
++sLine;
if ( eNewRule!=eRule ) // new rule header
{
if ( iCount )
printf ( "WARNING: Line %d: Premature end of entries.\n", iLine );
if ( sscanf ( sLine, "%c %c %d", &cFlag, &cCombine, &iCount )!=3 ) // NOLINT
printf ( "WARNING; Line %d: Malformed %s header\n", iLine, sMode );
eRule = eNewRule;
} else // current rule continued
{
*sRemove = *sAppend = 0;
char cNewFlag;
if ( sscanf ( sLine, "%c %s %s %s", &cNewFlag, sRemove, sAppend, sCondition )==4 ) // NOLINT
{
if ( cNewFlag!=cFlag )
printf ( "WARNING: Line %d: Flag character mismatch\n", iLine );
if ( *sRemove=='0' && *(sRemove + 1)==0 ) *sRemove = 0;
if ( *sAppend=='0' && *(sAppend + 1)==0 ) *sAppend = 0;
CISpellAffixRule Rule ( eRule, cFlag, cCombine=='Y', sCondition, sRemove, sAppend );
m_dRules.Add ( Rule );
} else
printf ( "WARNING: Line %d: Malformed %s rule\n", iLine, sMode );
if ( !--iCount ) eRule = RULE_NONE;
}
continue;
}
}
return true;
}
// Pointer to the rule at index iRule; the index is passed straight to the
// container's operator[] with no range check here.
CISpellAffixRule * CISpellAffix::GetRule ( int iRule )
{
return &m_dRules [iRule];
}
// Number of rules currently loaded.
int CISpellAffix::GetNumRules () const
{
return m_dRules.GetLength ();
}
// Whether the last Load() found both a cross-product prefix rule and a
// cross-product suffix rule.
bool CISpellAffix::CheckCrosses () const
{
return m_bCheckCrosses;
}
bool CISpellAffix::AddToCharset ( char * szRangeL, char * szRangeU )
{
if ( !szRangeL || !szRangeU )
return false;
int iLengthL = strlen ( szRangeL );
int iLengthU = strlen ( szRangeU );
bool bSetL = ( iLengthL>0 && szRangeL[0]=='[' && szRangeL[iLengthL-1]==']' );
bool bSetR = ( iLengthU>0 && szRangeU[0]=='[' && szRangeU[iLengthU-1]==']' );
if ( bSetL!=bSetR )
return false;
if ( bSetL )
{
szRangeL [iLengthL - 1] = '\0';
szRangeL = szRangeL + 1;
szRangeU [iLengthU - 1] = '\0';
szRangeU = szRangeU + 1;
BYTE uMinL, uMaxL;
if ( !GetSetMinMax ( szRangeL, uMinL, uMaxL ) )
return false;
BYTE uMinU, uMaxU;
if ( !GetSetMinMax ( szRangeU, uMinU, uMaxU ) )
return false;
if ( ( uMaxU - uMinU )!=( uMaxL - uMinL ) )
return false;
for ( BYTE i=0; i<=( uMaxL - uMinL ); ++i )
if ( IsInSet ( uMinL + i, szRangeL ) && IsInSet ( uMinU + i, szRangeU ) )
AddCharPair ( uMinL + i, uMinU + i );
} else
{
if ( iLengthL > 4 || iLengthU > 4 )
return false;
const char * szL = szRangeL;
const char * szU = szRangeU;
AddCharPair ( GetWordchar(szL), GetWordchar(szU) );
}
m_bUseDictConversion = true;
return true;
}
// Record that uppercase code uCharU lowercases to uCharL.
// The table is indexed by the uppercase code.
void CISpellAffix::AddCharPair ( BYTE uCharL, BYTE uCharU )
{
m_dCharset [uCharU] = uCharL;
}
// Normalize a rule field in place: remove every whitespace character and every
// '-', and convert the remaining characters with ToLowerCase().
void CISpellAffix::Strip ( char * szText )
{
	char * pOut = szText;
	for ( const char * pIn = szText; *pIn; ++pIn )
	{
		const char cCur = *pIn;
		if ( cCur=='-' || isspace ( (unsigned char) cCur ) )
			continue;

		// the write position never gets ahead of the read position
		*pOut++ = ToLowerCase ( cCur );
	}
	*pOut = '\0';
}
char CISpellAffix::ToLowerCase ( char cChar )
{
if ( m_bFirstCaseConv )
{
LoadLocale ();
m_bFirstCaseConv = false;
}
// dictionary conversion
if ( m_bUseDictConversion )
return m_dCharset [(BYTE) cChar] ? m_dCharset [(BYTE) cChar] : cChar;
// user-defined character mapping
if ( m_bUseLowerCaser )
{
char cResult = (char)m_LowerCaser.ToLower ( (BYTE) cChar );
return cResult ? cResult : cChar;
}
// user-specified code page conversion
return (char)tolower ( (BYTE)cChar ); // workaround for systems (eg. FreeBSD) which default to signed char. marvelous!
}
// Pick the case conversion source and report the choice on stdout.
//
// Order of preference: the dictionary-defined table (nothing to load), then the
// charset file (its first line is parsed into the lowercaser), then the
// user-supplied locale name. With none of them, only a warning is printed.
void CISpellAffix::LoadLocale ()
{
	// 'wordchars' statements take precedence over everything else
	if ( m_bUseDictConversion )
	{
		printf ( "Using dictionary-defined character set\n" );
		return;
	}

	// charset file given on the command line
	if ( !m_sCharsetFile.IsEmpty () )
	{
		FILE * pFile = fopen ( m_sCharsetFile.cstr (), "rt" );
		if ( !pFile )
		{
			printf ( "Failed to open '%s'\n", m_sCharsetFile.cstr() );
			return;
		}

		printf ( "Using charater set from '%s'\n", m_sCharsetFile.cstr () );

		const int MAX_CHARSET_LENGTH = 4096;
		char szBuffer [MAX_CHARSET_LENGTH];

		if ( !fgets ( szBuffer, MAX_CHARSET_LENGTH, pFile ) )
		{
			printf ( "Failed to read charset from '%s'\n", m_sCharsetFile.cstr() );

		} else
		{
			CSphVector<CSphRemapRange> dRemaps;
			if ( !sphParseCharset ( szBuffer, dRemaps ) )
			{
				printf ( "Failed to parse charset from '%s'\n", m_sCharsetFile.cstr() );

			} else
			{
				m_bUseLowerCaser = true;
				m_LowerCaser.AddRemaps ( dRemaps, 0 );
			}
		}

		fclose ( pFile );
		return;
	}

	// fall back to the locale name
	if ( m_sLocale.IsEmpty () )
	{
		printf ( "WARNING: no character set specified\n" );
		return;
	}

	// snapshot the "C" locale mapping to compare against later
	char dLocaleC[256], dLocaleUser[256];
	setlocale ( LC_ALL, "C" );
	for ( int iCode=0; iCode<256; iCode++ )
		dLocaleC[iCode] = (char) tolower(iCode);

	if ( !setlocale ( LC_CTYPE, m_sLocale.cstr() ) )
	{
		printf ( "WARNING: could not set user-defined locale for case conversions (locale=%s)\n", m_sLocale.cstr() );
		return;
	}

	printf ( "Using user-defined locale (locale=%s)\n", m_sLocale.cstr() );

	for ( int iCode=0; iCode<256; iCode++ )
		dLocaleUser[iCode] = (char) tolower(iCode);

	// a locale that changes nothing is most likely a mistake
	if ( memcmp ( dLocaleC, dLocaleUser, 256 )==0 )
		printf ( "WARNING: user-defined locale provides the same case conversion as the default \"C\" locale\n" );
}
//////////////////////////////////////////////////////////////////////////
// How to write a word form that maps to one or more base words.
enum OutputMode_e
{
	M_DEBUG				= 0,
	M_DUPLICATES		= 1,
	M_LAST				= 2,
	M_EXACT_OR_LONGEST	= 3,

	M_DEFAULT			= M_EXACT_OR_LONGEST
};
// Printable names of the output modes, indexed by OutputMode_e.
// The table covers every enum value, so indexing it with the default mode
// is safe too.
const char * dModeName[] =
{
	"debug",
	"duplicates",
	"last",
	"default"
};
// One candidate mapping for a word form: the base word it came from and the
// flags of the rules that produced it.
struct MapInfo_t
{
CSphString m_sWord;
// up to two rule flags followed by a terminating zero, so it can be printed with %s
char m_sRules[3];
};
// Sort predicate ordering C strings by the collation rules of the current locale.
struct WordLess
{
	bool IsLess ( const char * a, const char * b ) const
	{
		const int iOrder = strcoll ( a, b );
		return iOrder<0;
	}
};
// Map from a word form to every base word (with producing rule flags) it was derived from.
typedef CSphOrderedHash < CSphVector<MapInfo_t>, CSphString, CSphStrHashFunc, 100000, 13 > WordMap_t;
// Record that word form sFrom maps to base word sTo, remembering the flags of
// the (up to two) rules that produced it. Several mappings may accumulate for
// the same form; they are kept in insertion order.
static void EmitResult ( WordMap_t & tMap , const CSphString & sFrom, const CSphString & sTo, char cRuleA = 0, char cRuleB = 0 )
{
	MapInfo_t tEntry;
	tEntry.m_sWord = sTo;
	tEntry.m_sRules[0] = cRuleA;
	tEntry.m_sRules[1] = cRuleB;
	tEntry.m_sRules[2] = 0;

	// first mapping for this form: create its (empty) list
	if ( !tMap.Exists ( sFrom ) )
		tMap.Add ( CSphVector<MapInfo_t>(), sFrom );

	tMap[sFrom].Add ( tEntry );
}
// Entry point: expand every dictionary word with the affix rules its flags
// select, and write "form > base" lines to the result file.
//
// Usage: spelldump [options] <dictionary> <affix> [result] [locale-name]
// Returns 0 on success and 1 on bad arguments; load and open failures go through sphDie().
int main ( int iArgs, char ** dArgs )
{
OutputMode_e eMode = M_DEFAULT;
bool bUseCustomCharset = false;
CSphString sDict, sAffix, sLocale, sCharsetFile, sResult = "result.txt";
printf ( "spelldump, an ispell dictionary dumper\n\n" );
// options come first; parsing stops at the first argument that is not an option
int i = 1;
for ( ; i < iArgs; i++ )
{
if ( !strcmp ( dArgs[i], "-c" ) )
{
// an option without its value ends option parsing
if ( ++i==iArgs ) break;
bUseCustomCharset = true;
sCharsetFile = dArgs[i];
} else if ( !strcmp ( dArgs[i], "-m" ) )
{
if ( ++i==iArgs ) break;
char * sMode = dArgs[i];
if ( !strcmp ( sMode, "debug" ) ) { eMode = M_DEBUG; continue; }
if ( !strcmp ( sMode, "duplicates" ) ) { eMode = M_DUPLICATES; continue; }
if ( !strcmp ( sMode, "last" ) ) { eMode = M_LAST; continue; }
if ( !strcmp ( sMode, "default" ) ) { eMode = M_DEFAULT; continue; }
printf ( "Unrecognized mode: %s\n", sMode );
return 1;
} else
break;
}
// positional arguments; each case deliberately falls through to the next one
// so that the optional trailing arguments are picked up along with the required ones
switch ( iArgs - i )
{
case 4:
sLocale = dArgs[i + 3];
case 3:
sResult = dArgs[i + 2];
case 2:
sAffix = dArgs[i + 1];
sDict = dArgs[i];
break;
default:
printf ( "Usage: spelldump [options] <dictionary> <affix> [result] [locale-name]\n\n"
"Options:\n"
"-c <file>\tuse case convertion defined in <file>\n"
"-m <mode>\toutput (conflict resolution) mode:\n"
"\t\tdefault - try to guess the best way to resolve a conflict\n"
"\t\tlast - choose last entry\n"
"\t\tdebug - dump all mappings (with rules)\n"
"\t\tduplicates - dump duplicate mappings only (with rules)\n" );
// examples are only shown when some arguments were given
if ( iArgs>1 )
{
printf ( "\n"
"Examples:\n"
"spelldump en.dict en.aff\n"
"spelldump ru.dict ru.aff ru.txt ru_RU.CP1251\n"
"spelldump ru.dict ru.aff ru.txt .1251\n" );
}
return 1;
}
printf ( "Loading dictionary...\n" );
CISpellDict Dict;
if ( !Dict.Load ( sDict.cstr () ) )
sphDie ( "Error loading dictionary file '%s'\n", sDict.IsEmpty () ? "" : sDict.cstr () );
printf ( "Loading affix file...\n" );
CISpellAffix Affix ( sLocale.cstr (), bUseCustomCharset ? sCharsetFile.cstr () : NULL );
if ( !Affix.Load ( sAffix.cstr () ) )
sphDie ( "Error loading affix file '%s'\n", sAffix.IsEmpty () ? "" : sAffix.cstr () );
if ( sResult.IsEmpty () )
sphDie ( "No result file specified\n" );
FILE * pFile = fopen ( sResult.cstr (), "wt" );
if ( !pFile )
sphDie ( "Unable to open '%s' for writing\n", sResult.cstr () );
// the default mode is not announced
if ( eMode!=M_DEFAULT )
printf ( "Output mode: %s\n", dModeName[eMode] );
Dict.IterateStart ();
WordMap_t tWordMap;
const CISpellDict::CISpellDictWord * pWord = NULL;
int nDone = 0;
while ( ( pWord = Dict.IterateNext () )!=NULL )
{
// every word maps to itself
EmitResult ( tWordMap, pWord->m_sWord, pWord->m_sWord );
// progress report every 10 words
if ( ( ++nDone % 10 )==0 )
{
printf ( "\rDictionary words processed: %d", nDone );
fflush ( stdout );
}
if ( pWord->m_sFlags.IsEmpty() )
continue;
CSphString sWord, sWordForCross;
int iFlagLen = strlen ( pWord->m_sFlags.cstr () );
// for every flag of the word, try every rule declared under that flag
for ( int iFlag1 = 0; iFlag1 < iFlagLen; ++iFlag1 )
for ( int iRule1 = 0; iRule1 < Affix.GetNumRules (); ++iRule1 )
{
CISpellAffixRule * pRule1 = Affix.GetRule ( iRule1 );
if ( pRule1->Flag()!=pWord->m_sFlags.cstr()[iFlag1] )
continue;
// Apply() rewrites its argument, so work on a copy
sWord = pWord->m_sWord;
if ( !pRule1->Apply ( sWord ) )
continue;
EmitResult ( tWordMap, sWord, pWord->m_sWord, pRule1->Flag() );
// apply other rules
if ( !Affix.CheckCrosses() )
continue;
if ( !pRule1->IsCrossProduct() )
continue;
// cross products: combine with a cross-product rule of the other kind
// (prefix vs suffix) selected by one of the later flags of the word
for ( int iFlag2 = iFlag1 + 1; iFlag2 < iFlagLen; ++iFlag2 )
for ( int iRule2 = 0; iRule2 < Affix.GetNumRules (); ++iRule2 )
{
CISpellAffixRule * pRule2 = Affix.GetRule ( iRule2 );
if ( !pRule2->IsCrossProduct () || pRule2->Flag()!=pWord->m_sFlags.cstr()[iFlag2] ||
pRule2->IsPrefix()==pRule1->IsPrefix() )
continue;
sWordForCross = sWord;
if ( pRule2->Apply ( sWordForCross ) )
EmitResult ( tWordMap, sWordForCross, pWord->m_sWord, pRule1->Flag(), pRule2->Flag() );
}
}
}
printf ( "\rDictionary words processed: %d\n", nDone );
// output
// collect the word forms and sort them with WordLess (strcoll order)
CSphVector<const char *> dKeys;
tWordMap.IterateStart();
while ( tWordMap.IterateNext() )
dKeys.Add ( tWordMap.IterateGetKey().cstr() );
dKeys.Sort ( WordLess() );
ARRAY_FOREACH ( iKey, dKeys )
{
const CSphVector<MapInfo_t> & dWords = tWordMap[dKeys[iKey]];
const char * sKey = dKeys[iKey];
switch ( eMode )
{
case M_LAST:
// the mapping that was recorded last wins
fprintf ( pFile, "%s > %s\n", sKey, dWords.Last().m_sWord.cstr() );
break;
case M_EXACT_OR_LONGEST:
{
// prefer a base word equal to the form itself; otherwise take the
// longest base word (the earliest one among equally long candidates)
int iMatch = 0;
int iLength = 0;
ARRAY_FOREACH ( i, dWords )
{
if ( dWords[i].m_sWord==sKey )
{
iMatch = i;
break;
}
int iWordLength = strlen ( dWords[i].m_sWord.cstr() );
if ( iWordLength>iLength )
{
iLength = iWordLength;
iMatch = i;
}
}
fprintf ( pFile, "%s > %s\n", sKey, dWords[iMatch].m_sWord.cstr() );
break;
}
case M_DUPLICATES:
// forms with a single mapping are skipped; the rest falls through
// to the debug output on purpose
if ( dWords.GetLength()==1 ) break;
case M_DEBUG:
// every mapping, followed by its rule flags and the number of mappings
ARRAY_FOREACH ( i, dWords )
fprintf ( pFile, "%s > %s %s/%d\n", sKey, dWords[i].m_sWord.cstr(),
dWords[i].m_sRules, dWords.GetLength() );
break;
}
}
fclose ( pFile );
return 0;
}
//
// $Id$
//