<?php
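/*
I am a bit obsessive: I cannot stand seeing 行 in 银行 (bank, read "hang") and 行 in 行人 (pedestrian, read "xing") both transcribed as "xing",
so the purpose of this class is to tell polyphonic characters apart. It has one limitation: the database can only be a Microsoft Access file; I tried MySQL and SQLite and neither could sort Chinese text.
If you know how to do it, please share your knowledge and send a copy to hiyee@qq.com
Usage: Pinyin::getPinyin(str, mode, case, noise);
mode:  1: full pinyin  2: initial letter only
case:  1: all lowercase  2: capitalize the first letter  3: all uppercase
noise: 1: strip non-English characters
*/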
class Pinyin
{
    /**
     * Unicode code points of the characters treated as polyphonic
     * (characters whose reading depends on their neighbours).
     */
    private static $polyphones = array(
        20102,30528,20869,38463,25303,25170,34444,22561,36767,25153,20415,39584,21093,27850,34180,21340,
        21442,34255,24046,31109,39076,22066,31216,28548,21273,33261,20256,32496,25774,22823,24377,24471,
        30340,35843,37117,24230,22244,24694,32473,40863,21644,35977,36824,20250,31293,38477,35282,20389,
        21119,35299,34249,21170,39048,36228,22204,21345,22771,21549,28518,28889,20048,21202,20457,20603,
        38706,25419,33853,22475,33033,34067,27667,27852,31192,32554,27169,25705,23068,24324,21032,23631,
        36843,26420,28689,26333,26646,36426,22855,33640,24378,33540,20146,38592,22622,30465,20160,35782,
        35828,20282,20284,23487,25552,25299,31995,21523,21414,32420,24055,21066,26657,34892,30044,21693,
        27575,21505,25874,25321,26366,25166,36711,31896,25240,37325,23646,24162
    );

    /**
     * Convert a UTF-8 string to pinyin using the Access database "pinyin.lib".
     *
     * @param string $str   UTF-8 text to convert.
     * @param int    $mode  1: full pinyin, 2: initial letter of each character.
     * @param int    $case  1: lowercase, 2: first letter of each syllable upper-cased, 3: uppercase.
     * @param int    $noise Non-zero: strip everything that is not an ASCII letter from the result.
     * @return string The converted text; "" for empty input.
     */
    public static function getPinyin($str, $mode = 1, $case = 2, $noise = 0)
    {
        $PinYin = "";
        // Not empty(): that would also reject the perfectly valid input "0".
        if ($str === null || $str === false || $str === '' || $str === array()) {
            return $PinYin;
        }
        $str = self::SBC2DBC((string)$str); // fold common full-width symbols to half-width

        $file = 'pinyin.lib'; // Access .mdb database, resolved relative to the working directory
        if (!file_exists($file)) throw_exception(language('SYSTEM:file.not.exists', array('pinyin')));
        $db = @odbc_connect("DRIVER={Microsoft Access Driver (*.mdb)}; DBQ=" . realpath($file), "", "", SQL_CUR_USE_ODBC);
        // The original dereferenced an undefined $e here; report the driver's own message instead.
        if (!$db) throw_exception(language('SYSTEM:function.not.exists', array(odbc_errormsg())));

        // ASCII code points of " * / : < > ? \ | which are replaced by '-'
        $dashed = array(34, 42, 47, 58, 60, 62, 63, 92, 124);

        $hzLen = self::str_len($str);
        for ($i = 0; $i < $hzLen; $i++) {
            $hanzi = self::str_sub($str, $i, 1); // current character
            $hzAsc = self::ascw($hanzi);         // its Unicode code point

            if (in_array($hzAsc, self::$polyphones, true)) {
                $py = self::polyphonePinyin($db, $str, $i, $hzLen, $hanzi, $hzAsc);
            } elseif ($hzAsc >= 32 && $hzAsc <= 126) {
                $py = in_array($hzAsc, $dashed, true) ? '-' : self::chrw($hzAsc);
            } elseif ($hzAsc >= 19968 && $hzAsc <= 40869) {
                // $hanzi is a single character inside the CJK range checked above,
                // so it cannot carry quotes into the statement.
                $py = self::queryPinyin($db, "SELECT TOP 1 Pinyin FROM PinYin WHERE Word >= '$hanzi'");
            } else {
                $py = $hanzi; // other punctuation / symbols are passed through
            }

            // mode 2 keeps only the initial; take a whole character, not a byte,
            // so pass-through multi-byte symbols are not cut in half.
            if ($mode == 2) $py = self::str_sub($py, 0, 1);

            if ($case == 2) {
                $py = ucfirst($py);
            } elseif ($case == 3) {
                $py = strtoupper($py);
            } else {
                $py = strtolower($py);
            }
            $PinYin .= $py;
        }

        if ($noise) $PinYin = preg_replace('/[^a-zA-Z]/i', '', $PinYin);
        odbc_close($db);
        return $PinYin;
    }

    /**
     * Resolve the reading of a polyphonic character from its neighbours.
     *
     * Looks the character up in DuoYinZi by its left and/or right neighbour,
     * then falls back to three hard-coded readings and finally to the
     * default PinYin table.
     *
     * @param resource $db     Open ODBC connection.
     * @param string   $str    The whole (already folded) input string.
     * @param int      $i      Zero-based character index of $hanzi in $str.
     * @param int      $hzLen  Number of characters in $str.
     * @param string   $hanzi  The polyphonic character.
     * @param int      $hzAsc  Code point of $hanzi.
     * @return string The pinyin found, or "" when nothing matched.
     */
    private static function polyphonePinyin($db, $str, $i, $hzLen, $hanzi, $hzAsc)
    {
        $py = "";
        $where = array();
        // Character indexes are zero-based: index 0 has no left neighbour
        // and index $hzLen - 1 has no right neighbour.
        if ($i > 0) {
            $phz = self::str_sub($str, $i - 1, 1);
            if ($phz !== "") $where[] = "LWords LIKE '%" . self::escapeLike($phz) . "%'";
        }
        if ($i < $hzLen - 1) {
            $nhz = self::str_sub($str, $i + 1, 1);
            if ($nhz !== "") $where[] = "RWords LIKE '%" . self::escapeLike($nhz) . "%'";
        }
        if (count($where) > 0) {
            $cond = count($where) > 1 ? '(' . join(' OR ', $where) . ')' : $where[0];
            // $hanzi was matched against the polyphone list, so it is a plain CJK character.
            $py = self::queryPinyin($db, "SELECT TOP 1 Pinyin FROM DuoYinZi WHERE Word = '$hanzi' AND " . $cond);
        }

        if ($py == "") {
            // Context did not decide: use the hard-coded or default reading.
            switch ($hzAsc) {
                case 20102:
                    $py = "le";  // U+4E86
                    break;
                case 20869:
                    $py = "nei"; // U+5185
                    break;
                case 30528:
                    $py = "zhe"; // U+7740
                    break;
                default:
                    $py = self::queryPinyin($db, "SELECT TOP 1 PinYin FROM PinYin WHERE Word >= '$hanzi'");
            }
        }
        return $py;
    }

    /**
     * Run a single-value lookup and return the first column of the first row.
     *
     * The statement is converted from UTF-8 to GBK before execution, as the
     * original code did. Any failure (unconvertible text, driver error, no
     * row) yields "".
     *
     * @param resource $db  Open ODBC connection.
     * @param string   $sql UTF-8 SQL statement.
     * @return string
     */
    private static function queryPinyin($db, $sql)
    {
        $encoded = iconv("UTF-8", 'GBK', $sql);
        if ($encoded === false) {
            return ""; // text has no GBK representation, so it cannot be in the table
        }
        $rs = odbc_exec($db, $encoded);
        if (!$rs) {
            return "";
        }
        $py = odbc_fetch_row($rs) ? odbc_result($rs, 1) : "";
        odbc_free_result($rs);
        return ($py === false || $py === null) ? "" : (string)$py;
    }

    /**
     * Make a value safe to embed inside a quoted LIKE pattern.
     *
     * Single quotes are doubled; the LIKE metacharacters are wrapped in
     * brackets so they match themselves.
     * NOTE(review): bracket escaping follows the Access "Like" operator
     * rules; confirm against the driver in use.
     *
     * @param string $value
     * @return string
     */
    private static function escapeLike($value)
    {
        return strtr($value, array('[' => '[[]', '%' => '[%]', '_' => '[_]', "'" => "''"));
    }

    /**
     * Return the UTF-8 encoding of a BMP code point (0..65535).
     *
     * @param int $code
     * @return string|false
     */
    private static function chrw($code)
    {
        $code = (int)$code;
        $str = chr(($code >> 8) & 0xFF) . chr($code & 0xFF);
        // Byte order is spelled out: plain 'UCS-2' is platform-dependent.
        return iconv('UCS-2BE', "UTF-8", $str);
    }

    /**
     * Return the Unicode code point of the first character of a UTF-8 string.
     *
     * Truncated or malformed sequences fall back to the value of the first byte.
     *
     * @param string $word
     * @return int 0 for an empty string.
     */
    private static function ascw($word)
    {
        $word = (string)$word;
        $len = strlen($word);
        if ($len === 0) {
            return 0;
        }
        $b0 = ord($word[0]);
        if (($b0 & 0xE0) === 0xC0 && $len >= 2) {
            return (($b0 & 0x1F) << 6) | (ord($word[1]) & 0x3F);
        }
        if (($b0 & 0xF0) === 0xE0 && $len >= 3) {
            return (($b0 & 0x0F) << 12) | ((ord($word[1]) & 0x3F) << 6) | (ord($word[2]) & 0x3F);
        }
        if (($b0 & 0xF8) === 0xF0 && $len >= 4) {
            return (($b0 & 0x07) << 18) | ((ord($word[1]) & 0x3F) << 12)
                | ((ord($word[2]) & 0x3F) << 6) | (ord($word[3]) & 0x3F);
        }
        return $b0;
    }

    /**
     * Count the characters of a UTF-8 string.
     *
     * Uses mb_strlen when available so that indexes agree with str_sub();
     * otherwise walks the lead bytes by hand.
     *
     * @param string $str
     * @return int
     */
    private static function str_len($str)
    {
        if (function_exists("mb_strlen")) {
            return mb_strlen($str, "utf-8");
        }
        $count = 0;
        $len = strlen($str);
        $i = 0;
        while ($i < $len) {
            $chr = ord($str[$i]);
            $count++;
            $i++;
            // Each further leading 1-bit of the lead byte announces one continuation byte.
            if ($chr & 0x80) {
                $chr <<= 1;
                while ($chr & 0x80) {
                    $i++;
                    $chr <<= 1;
                }
            }
        }
        return $count;
    }

    /**
     * Character-aware substring.
     *
     * @param string $str
     * @param int    $start   Zero-based character offset (negative counts from the end).
     * @param int    $length  Number of characters to return.
     * @param string $charset One of utf-8, gb2312, gbk, big5 for the regex fallback.
     * @return string
     */
    private static function str_sub($str, $start = 0, $length = 1, $charset = "utf-8")
    {
        if (function_exists("mb_substr")) {
            return mb_substr($str, $start, $length, $charset);
        }
        $re['utf-8'] = "/[\x01-\x7f]|[\xc2-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xff][\x80-\xbf]{3}/";
        $re['gb2312'] = "/[\x01-\x7f]|[\xb0-\xf7][\xa0-\xfe]/";
        $re['gbk'] = "/[\x01-\x7f]|[\x81-\xfe][\x40-\xfe]/";
        $re['big5'] = "/[\x01-\x7f]|[\x81-\xfe]([\x40-\x7e]|[\xa1-\xfe])/";
        preg_match_all($re[$charset], $str, $match);
        return join("", array_slice($match[0], $start, $length));
    }

    /**
     * Fold common full-width characters to their half-width ASCII form.
     *
     * The table is generated from code points rather than written as
     * literals so it cannot be damaged by re-encoding or normalisation of
     * this source file.
     *
     * @param string $str UTF-8 text.
     * @return string
     */
    private static function SBC2DBC($str)
    {
        static $map = null;
        if ($map === null) {
            $map = array();
            // ASCII characters whose full-width twin (code point + 0xFEE0) is folded.
            $ascii = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-:.,/%#!@&()<>"\'?[]{}\\|+=_^`';
            for ($k = 0, $n = strlen($ascii); $k < $n; $k++) {
                $cp = ord($ascii[$k]) + 0xFEE0; // U+FF01..U+FF5E, always three UTF-8 bytes
                $wide = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
                $map[$wide] = $ascii[$k];
            }
            $map["\xE3\x80\x80"] = ' '; // U+3000 ideographic space
            $map["\xE3\x80\x82"] = '.'; // U+3002 ideographic full stop
            $map["\xEF\xBF\xA5"] = '$'; // U+FFE5 full-width yen sign
            $map["\xC2\xA5"]     = '$'; // U+00A5 yen sign
            $map["\xEF\xBF\xA3"] = '~'; // U+FFE3 full-width macron
        }
        return strtr($str, $map); // full-width to half-width
    }
}