138 lines
4.9 KiB
PHP
138 lines
4.9 KiB
PHP
<?php
|
|
/**
|
|
* The spliter class file of chanzhiEPS.
|
|
*
|
|
* @copyright Copyright 2013-2014 青岛息壤网络信息有限公司 (QingDao XiRang Network Infomation Co,LTD www.xirangit.com)
|
|
* @license LGPL
|
|
* @author Xiying Guan <guanxiying@xirangit.com>
|
|
* @package spliter
|
|
* @version $Id$
|
|
* @link http://www.chanzhi.org
|
|
* @see http://www.cnblogs.com/chenwenbiao/archive/2011/08/11/2134503.html
|
|
* @see http://stackoverflow.com/questions/9361303/can-i-get-the-unicode-value-of-a-character-or-vise-versa-with-php
|
|
*/
|
|
class spliter
|
|
{
|
|
/**
|
|
* Split a utf-8 string into words, computing unicode for every word.
|
|
*
|
|
* @param string $string
|
|
* @access public
|
|
* @return array
|
|
*/
|
|
public function utf8Split($string)
|
|
{
|
|
$string = strtolower($string);
|
|
$i = 0;
|
|
$length = strlen($string);
|
|
$dict = array();
|
|
$words = '';
|
|
$offset = 0;
|
|
|
|
while($i <= $length)
|
|
{
|
|
$letter = substr($string, $i, 1);
|
|
$ord = ord($letter);
|
|
|
|
/* The first letter is ascii, try to get a word. */
|
|
if($ord >= 0 && $ord <= 191)
|
|
{
|
|
$i ++;
|
|
|
|
/* Stitching content in the case of words or Spaces. */
|
|
if($this->isLetter($letter))
|
|
{
|
|
$word = $letter;
|
|
while($i <= $length)
|
|
{
|
|
$letter = substr($string, $i, 1);
|
|
if(!$this->isLetter($letter)) break;
|
|
|
|
$word .= $letter;
|
|
$i++;
|
|
}
|
|
|
|
/* Process intigers. */
|
|
if(is_numeric($word) and (strpos($word, '.') === false)) $word = "|" . $word . "|";
|
|
$word = str_pad(strtolower($word), 5, '_');
|
|
$words .= ' ' . $word;
|
|
}
|
|
elseif($ord == 32)
|
|
{
|
|
$words .= ' ';
|
|
}
|
|
continue;
|
|
}
|
|
|
|
if($ord >= 192 && $ord <= 223) $offset = 2;
|
|
if($ord >= 224 && $ord <= 239) $offset = 3;
|
|
if($ord >= 240 && $ord <= 247) $offset = 4;
|
|
if($ord >= 248 && $ord <= 251) $offset = 5;
|
|
if($ord >= 252 && $ord <= 253) $offset = 6;
|
|
|
|
if($offset >= 2)
|
|
{
|
|
$letter = substr($string, $i, $offset);
|
|
$unicode = $this->unicode($letter);
|
|
if(strlen($unicode) == 5)
|
|
{
|
|
$dict[$unicode] = $letter;
|
|
$words .= ' ' . $unicode;
|
|
}
|
|
else
|
|
{
|
|
/* When the current word has a corresponding number in the dictionary table, concatenate a space before it. */
|
|
if(is_numeric(substr($words, strlen($words) - 1, 1)))
|
|
{
|
|
$words .= ' ' . $letter;
|
|
}
|
|
else
|
|
{
|
|
$words .= $letter;
|
|
}
|
|
}
|
|
$i += $offset;
|
|
}
|
|
}
|
|
|
|
return array('dict' => $dict, 'words' => $words);
|
|
}
|
|
|
|
/**
|
|
* Return unicode value for a char.
|
|
*
|
|
* @param string $c
|
|
* @access public
|
|
* @return int
|
|
*/
|
|
public function unicode($c)
|
|
{
|
|
if(ord($c[0]) >= 0 && ord($c[0]) <= 127) return ord($c[0]);
|
|
if(ord($c[0]) >= 192 && ord($c[0]) <= 223) return (ord($c[0]) - 192) * 64 + (ord($c[1]) - 128);
|
|
if(ord($c[0]) >= 224 && ord($c[0]) <= 239) return (ord($c[0]) - 224) * 4096 + (ord($c[1]) - 128) * 64 + (ord($c[2]) - 128);
|
|
if(ord($c[0]) >= 240 && ord($c[0]) <= 247) return (ord($c[0]) - 240) * 262144 + (ord($c[1]) - 128) * 4096 + (ord($c[2]) - 128) * 64 + (ord($c[3]) - 128);
|
|
if(ord($c[0]) >= 248 && ord($c[0]) <= 251) return (ord($c[0]) - 248) * 16777216 + (ord($c[1]) - 128) * 262144 + (ord($c[2]) - 128) * 4096 + (ord($c[3]) - 128) * 64 + (ord($c[4]) - 128);
|
|
if(ord($c[0]) >= 252 && ord($c[0]) <= 253) return (ord($c[0]) - 252) * 1073741824 + (ord($c[1]) - 128) * 16777216 + (ord($c[2]) - 128) * 262144 + (ord($c[3]) - 128) * 4096 + (ord($c[4]) - 128) * 64 + (ord($c[5]) - 128);
|
|
|
|
if(ord($c[0]) >= 254 && ord($c[0]) <= 255) return false;
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Judge a char is Letter or not.
|
|
*
|
|
* @param string $letter
|
|
* @access public
|
|
* @return bool
|
|
*/
|
|
public function isLetter($letter)
|
|
{
|
|
$ord = ord($letter);
|
|
if($ord >= ord('a') and $ord <= ord('z')) return true;
|
|
if($ord >= ord('A') and $ord <= ord('Z')) return true;
|
|
if($ord >= ord(0) and $ord <= ord(9)) return true;
|
|
if($letter and strpos('._/->:<?&', $letter) !== false) return true;
|
|
return false;
|
|
}
|
|
}
|