import re
import os
import sys
class SEG(object):
def __init__(self):
_localDir=os.path.dirname(__file__)
_curpath=os.path.normpath(os.path.join(os.getcwd(),_localDir))
curpath=_curpath
self.d = {}
print >> sys.stderr,"loading dict..."
self.set([x.rstrip() for x in file(os.path.join(curpath,"main.dic")) ])
self.specialwords= set([x.rstrip().decode('utf-8') for x in file(os.path.join(curpath,"suffix.dic"))])
print >> sys.stderr,'dict ok.'
#set dictionary(a list)
def set(self,keywords):
p = self.d
q = {}
k = ''
for word in keywords:
word = (chr(11)+word).decode('utf-8')
if len(word)>5:
continue
p = self.d
ln = len(word)
for i in xrange(ln-1,-1,-1):
char = word[i].lower()
if p=='':
q[k] = {}
p = q[k]
if not (char in p):
p[char] = ''
q = p
k = char
p = p[char]
pass
def _binary_seg(self,s):
ln = len(s)
if ln==1:
return [s]
R = []
for i in xrange(ln,1,-1):
tmp = s[i-2:i]
R.append(tmp)
return R
def _pro_unreg(self,piece):
#print piece
R = []
tmp = re.sub(u"。|,|,|!|…|!|《|》|<|>|\"|'|:|:|?|\?|、|\||“|”|‘|’|;|—|(|)|·|\(|\)| "," ",piece).split()
ln1 = len(tmp)
for i in xrange(len(tmp)-1,-1,-1):
mc = re.split(r"([0-9A-Za-z\-\+#@_\.]+)",tmp[i])
for j in xrange(len(mc)-1,-1,-1):
r = mc[j]
if re.search(r"([0-9A-Za-z\-\+#@_\.]+)",r)!=None:
R.append(r)
else:
R.extend(self._binary_seg(r))
return R
def cut(self,text):
"""
"""
text = text.decode('utf-8','ignore')
p = self.d
ln = len(text)
i = ln
j = 0
z = ln
q = 0
recognised = []
mem = None
mem2 = None
while i-j>0:
t = text[i-j-1].lower()
#print i,j,t,mem
if not (t in p):
if (mem!=None) or (mem2!=None):
if mem!=None:
i,j,z = mem
mem = None
elif mem2!=None:
delta = mem2[0]-i
if delta>=1:
if (delta<5) and (re.search(ur"[\w\u2E80-\u9FFF]",t)!=None):
pre = text[i-j]
#print pre
if not (pre in self.specialwords):
i,j,z,q = mem2
del recognised[q:]
mem2 = None
p = self.d
if((i<ln) and (i<z)):
unreg_tmp = self._pro_unreg(text[i:z])
recognised.extend(unreg_tmp)
recognised.append(text[i-j:i])
#print text[i-j:i],mem2
i = i-j
z = i
j = 0
continue
j = 0
i -= 1
p = self.d
continue
p = p[t]
j+=1
if chr(11) in p:
if j<=2:
mem = i,j,z
#print text[i-1]
if (z-i<2) and (text[i-1] in self.specialwords) and ((mem2==None) or ((mem2!=None and mem2[0]-i>1))):
#print text[i-1]
mem = None
mem2 = i,j,z,len(recognised)
p = self.d
i -= 1
j = 0
continue
#print mem
p = self.d
#print i,j,z,text[i:z]
if((i<ln) and (i<z)):
unreg_tmp = self._pro_unreg(text[i:z])
recognised.extend(unreg_tmp)
recognised.append(text[i-j:i])
i = i-j
z = i
j = 0
mem = None
mem2 = None
#print mem
if mem!=None:
i,j,z = mem
recognised.extend(self._pro_unreg(text[i:z]))
recognised.append(text[i-j:i])
else:
recognised.extend(self._pro_unreg(text[i-j:z]))
return recognised