* @package spliter * @version $Id$ * @link http://www.chanzhi.org * @see http://www.cnblogs.com/chenwenbiao/archive/2011/08/11/2134503.html * @see http://stackoverflow.com/questions/9361303/can-i-get-the-unicode-value-of-a-character-or-vise-versa-with-php */ class spliter { /** * Split a utf-8 string into words, computing unicode for every word. * * @param string $string * @access public * @return array */ public function utf8Split($string) { $string = strtolower($string); $i = 0; $length = strlen($string); $dict = array(); $words = ''; $offset = 0; while($i <= $length) { $letter = substr($string, $i, 1); $ord = ord($letter); /* The first letter is ascii, try to get a word. */ if($ord >= 0 && $ord <= 191) { $i ++; /* Stitching content in the case of words or Spaces. */ if($this->isLetter($letter)) { $word = $letter; while($i <= $length) { $letter = substr($string, $i, 1); if(!$this->isLetter($letter)) break; $word .= $letter; $i++; } /* Process intigers. */ if(is_numeric($word) and (strpos($word, '.') === false)) $word = "|" . $word . "|"; $word = str_pad(strtolower($word), 5, '_'); $words .= ' ' . $word; } elseif($ord == 32) { $words .= ' '; } continue; } if($ord >= 192 && $ord <= 223) $offset = 2; if($ord >= 224 && $ord <= 239) $offset = 3; if($ord >= 240 && $ord <= 247) $offset = 4; if($ord >= 248 && $ord <= 251) $offset = 5; if($ord >= 252 && $ord <= 253) $offset = 6; if($offset >= 2) { $letter = substr($string, $i, $offset); $unicode = $this->unicode($letter); if(strlen($unicode) == 5) { $dict[$unicode] = $letter; $words .= ' ' . $unicode; } else { /* When the current word has a corresponding number in the dictionary table, concatenate a space before it. */ if(is_numeric(substr($words, strlen($words) - 1, 1))) { $words .= ' ' . $letter; } else { $words .= $letter; } } $i += $offset; } } return array('dict' => $dict, 'words' => $words); } /** * Return unicode value for a char. * * @param string $c * @access public * @return int */ public function unicode($c) { if(ord($c[0]) >= 0 && ord($c[0]) <= 127) return ord($c[0]); if(ord($c[0]) >= 192 && ord($c[0]) <= 223) return (ord($c[0]) - 192) * 64 + (ord($c[1]) - 128); if(ord($c[0]) >= 224 && ord($c[0]) <= 239) return (ord($c[0]) - 224) * 4096 + (ord($c[1]) - 128) * 64 + (ord($c[2]) - 128); if(ord($c[0]) >= 240 && ord($c[0]) <= 247) return (ord($c[0]) - 240) * 262144 + (ord($c[1]) - 128) * 4096 + (ord($c[2]) - 128) * 64 + (ord($c[3]) - 128); if(ord($c[0]) >= 248 && ord($c[0]) <= 251) return (ord($c[0]) - 248) * 16777216 + (ord($c[1]) - 128) * 262144 + (ord($c[2]) - 128) * 4096 + (ord($c[3]) - 128) * 64 + (ord($c[4]) - 128); if(ord($c[0]) >= 252 && ord($c[0]) <= 253) return (ord($c[0]) - 252) * 1073741824 + (ord($c[1]) - 128) * 16777216 + (ord($c[2]) - 128) * 262144 + (ord($c[3]) - 128) * 4096 + (ord($c[4]) - 128) * 64 + (ord($c[5]) - 128); if(ord($c[0]) >= 254 && ord($c[0]) <= 255) return false; return false; } /** * Judge a char is Letter or not. * * @param string $letter * @access public * @return bool */ public function isLetter($letter) { $ord = ord($letter); if($ord >= ord('a') and $ord <= ord('z')) return true; if($ord >= ord('A') and $ord <= ord('Z')) return true; if($ord >= ord(0) and $ord <= ord(9)) return true; if($letter and strpos('._/->: