# encoding=utf8
import jieba.posseg as poseg
import pandas as pd
from collections import OrderedDict
from collections import defaultdict
from utils.prepare_utils import JiebaTuning as JT
from utils.prepare_utils import GeneralAddressing as GA
import json
import cPickle
import numpy as np
import re
from ipdb import set_trace as st
# Slash separator; presumably used when joining items into one string
# (its usage is not visible in this part of the file -- confirm).
JOIN = '/'
# Line terminator. NOTE(review): despite the name this is a bare LF ('\n'),
# not a CR+LF pair, and the name itself looks like a transposition of "CRLF".
CLRF = '\n'
class XHLanguageTemplate(object):
    """Rule-based extractors that run over a tagged token sequence.

    Every classmethod takes ``topic_content``: an indexable sequence whose
    items are read as ``item[0]`` (the word) and ``item[1]`` (its tag).
    Each method looks for its own indicator words and returns a list of
    dicts describing what was found around them.  A token tagged ``'x'``
    is the boundary at which most of the surrounding scans stop.

    The dict keys are Chinese literals and are part of the output format:
    '检查子区域' (examined sub-region), '指标' (indicator), '断言'
    (assertion: '有' = present, '无' = absent), '描述' (description) and
    '结果' (result).
    """

    # Names of examination areas.  Not referenced by the methods below.
    _exam_areas = (
        u'膀胱',
        u'宫体/子宫',
        u'宫颈',
        u'左附件',
        u'右附件',
        u'直肠',
        u'盆腔骨',
        u'盆腔肌肉',
        u'直肠膀胱三角',
        u'盆腔淋巴结',
        u'盆腔积液',
        u'阴道',
        u'外阴',
        u'宫旁',
        u'宫腔积液',
        u'其他'
    )
    # Words that turn the assertion for a nearby indicator into '无'.
    _deny_word = (u'不伴', u'未见', u'未', u'无', u'不', u'未见异常',)
    # Affirming words.  Not referenced by the methods below.
    _ensure_word = (u'可见', u'伴', u'为', u'呈')

    @classmethod
    def one(cls, topic_content):
        """Extract one record for every 'T1' / 'T2' token that is not first.

        For each such token the record holds:

        * '检查子区域': the words tagged ``'f'`` found before the token,
          joined with ``'_'`` in reading order;
        * '指标': the token itself;
        * '断言': '无' if a word from ``_deny_word`` precedes the token
          within the same segment, otherwise '有';
        * '描述': an optional degree word (the token right before, when its
          tag is ``'u'``, ``'a'`` or ``'m'``) followed by the first run of
          tokens after the indicator whose tags match a known pattern.

        All values are UTF-8 encoded.

        :param topic_content: sequence of (word, tag) items.
        :return: list of dicts, in order of appearance.
        """
        words = ('T1', 'T2')
        ret = []
        _des_pattern = ('n+v', 'v+n', 'u+n', 'd+l', 'a+n', 'n+ns', 'f+v+a', 'n+a', 'd+n', 'n+n+v+a+v',)
        _des_pattern_split = '+'
        _degree_pos = ('u', 'a', 'm')
        # The number of tags in each pattern never changes; compute it once
        # instead of re-splitting every pattern at every position.
        pattern_sizes = [(pattern, len(pattern.split(_des_pattern_split)))
                         for pattern in _des_pattern]
        total = len(topic_content)

        for index, tc in enumerate(topic_content):
            if tc[0] not in words or index == 0:
                continue

            # Sub-region: walk back to the nearest 'f' token, then keep
            # collecting earlier 'f' tokens until an 'x' token is met.
            # NOTE(review): the search for the *first* 'f' token is not
            # stopped by 'x'; confirm that crossing a boundary is intended.
            pre = []
            for i in range(index - 1, -1, -1):
                if topic_content[i][1] == 'f':
                    pre.append(topic_content[i][0])
                    for j in range(i - 1, -1, -1):
                        if topic_content[j][1] == 'f':
                            pre.append(topic_content[j][0])
                        elif topic_content[j][1] == 'x':
                            break
                    break

            # Degree word: only the token immediately before the indicator.
            degree = ''
            if topic_content[index - 1][1] in _degree_pos:
                degree = topic_content[index - 1][0]

            # Description: slide forward until one tag pattern matches.
            # The very last token is never used as a start position.
            des = []
            i = index + 1
            while i < total - 1:
                at_boundary = (topic_content[i][1] == 'x'
                               or topic_content[i][0] in words)
                # With a degree word already in hand, a boundary ends the
                # search.  (`des` is always empty here because the scan
                # stops at its first match, so it needs no separate check.)
                if at_boundary and degree != '':
                    break
                matched = False
                for pattern, size in pattern_sizes:
                    if i + size > total:
                        continue
                    span = range(i, i + size)
                    real_pattern = _des_pattern_split.join(
                        [topic_content[k][1] for k in span])
                    if real_pattern == pattern:
                        des.append(''.join([topic_content[k][0] for k in span]))
                        matched = True
                        break
                # NOTE(review): only the first matching run is kept.
                if matched:
                    break
                i += 1

            if degree:
                des.insert(0, degree)

            # Negation: look backwards; an 'x' token ends the scan unless it
            # sits directly before the indicator.
            neg_or_pos = u'有'
            for i in range(index - 1, -1, -1):
                if topic_content[i][1] == 'x' and i != (index - 1):
                    break
                if topic_content[i][0] in cls._deny_word:
                    neg_or_pos = u'无'
                    break

            tmp_info = {}
            tmp_info['检查子区域'] = '_'.join(reversed(pre)).encode('utf-8')
            tmp_info['指标'] = tc[0].encode('utf-8')
            tmp_info['断言'] = neg_or_pos.encode('utf-8')
            tmp_info['描述'] = '_'.join(des).encode('utf-8')
            ret.append(tmp_info)
        return ret

    @classmethod
    def two(cls, topic_content):
        """Extract the text that follows every '结合带' token.

        The words after the indicator are concatenated until the first
        token whose tag contains ``'x'``.

        :param topic_content: sequence of (word, tag) items.
        :return: list of dicts with UTF-8 encoded '指标' and '结果'.
        """
        words = (u'结合带',)
        ret = []
        total = len(topic_content)
        for index, tc in enumerate(topic_content):
            if tc[0] not in words:
                continue
            collected = []
            for i in range(index + 1, total):
                # Substring test on the tag, not equality.
                if 'x' in topic_content[i][1]:
                    break
                collected.append(topic_content[i][0])
            tmp_info = {}
            tmp_info['指标'] = tc[0].encode('utf-8')
            tmp_info['结果'] = (''.join(collected)).encode('utf-8')
            ret.append(tmp_info)
        return ret

    @classmethod
    def three(cls, topic_content):
        """Report whether the first '盆腔积液' token is affirmed or negated.

        A word from ``_deny_word`` either after the indicator (an ``'x'``
        token directly after the indicator is tolerated) or before it,
        within the same segment, yields '无'; otherwise '有'.  Only the
        first occurrence of the indicator is reported.

        :param topic_content: sequence of (word, tag) items.
        :return: list with at most one dict ('指标' UTF-8 encoded, '断言').
        """
        words = (u'盆腔积液',)
        ret = []
        total = len(topic_content)
        for index, tc in enumerate(topic_content):
            if tc[0] not in words:
                continue
            has_or_not = '有'
            # nearest after
            for i in range(index + 1, total):
                if topic_content[i][1] == 'x' and i > index + 1:
                    break
                if topic_content[i][0] in cls._deny_word:
                    has_or_not = '无'
                    break
            # nearest before
            # Start at index - 1, as `one` and `four` do.  Starting on the
            # indicator itself could never find a negation word and, when
            # the indicator is tagged 'x', stopped the scan before any
            # preceding token was examined.
            for i in range(index - 1, -1, -1):
                if topic_content[i][1] == 'x':
                    break
                if topic_content[i][0] in cls._deny_word:
                    has_or_not = '无'
                    break
            tmp_info = {}
            tmp_info['指标'] = tc[0].encode('utf-8')
            tmp_info['断言'] = has_or_not
            ret.append(tmp_info)
            break
        return ret

    @classmethod
    def four(cls, topic_content):
        """Extract one record for every 'DWI' token.

        '断言' becomes '无' when a word from ``_deny_word`` occurs in the
        same segment before or after the indicator, except for a deny word
        tagged ``'d'`` that is directly followed by '均匀'.  '描述' is
        '低信号' if that word follows the indicator in the same segment,
        otherwise '高信号'.

        :param topic_content: sequence of (word, tag) items.
        :return: list of dicts, in order of appearance.
        """
        words = (u'DWI',)
        ret = []
        total = len(topic_content)
        for index, tc in enumerate(topic_content):
            if tc[0] not in words:
                continue
            if_deny = '有'
            high_or_low = '高信号'
            # NOTE(review): nothing ever fills this list, so '检查子区域'
            # is always the empty string for DWI records.
            location = []
            # Backwards: the token is checked for negation before the
            # boundary test, so the boundary token itself is still examined.
            for i in range(index - 1, -1, -1):
                if topic_content[i][0] in cls._deny_word:
                    if_deny = '无'
                if topic_content[i][1] == 'x':
                    break
            for i in range(index + 1, total):
                if topic_content[i][1] == 'x':
                    break
                if topic_content[i][0] in cls._deny_word:
                    followed_by_even = (topic_content[i][1] == 'd'
                                        and i < total - 1
                                        and topic_content[i + 1][0] in (u'均匀', ))
                    if followed_by_even:
                        # Deny word + '均匀' is not treated as a negation.
                        continue
                    if_deny = '无'
                if topic_content[i][0] == u'低信号':
                    high_or_low = '低信号'
            tmp_info = {}
            tmp_info['检查子区域'] = '_'.join(reversed(location)).encode('utf-8')
            tmp_info['指标'] = tc[0].encode('utf-8')
            tmp_info['断言'] = if_deny
            tmp_info['描述'] = high_or_low
            ret.append(tmp_info)
        return ret
class XHPipeline(object):
_stop_words = [
u',', u':', u';', u'.', u'', u'(', u')', u'×', u'"', u'-', u'*',
u'并', u'的', u'于', u'者', u'在', u'另', u'与', u'一', u'其', u'有',
u'约', u'呈', u'信号', u'略', u'或', u'所示', u'其它', u'等', u'稍',
u'达', u'区', u'为', u'位', u'范围', u'扫描', u'段', u'显示', u'位于',
u'直径约', u'序列', u'程度', u'性', u'扫', u'行', u'走', u'值', u'以',
u'显', u'至', u'受', u'分别', u'局限', u'子', u'带', u'线', u'最',
u'为主', u'缘', u'中', u'区域', u'数量', u'水平', u'灶', u'平', u'处',
u'皆', u'样', u'及'
]
def __init__(self,
input_path, columns, output_path,
usr_dict_path, usr_suggest_path,
confirm_word_path,
hitting_word_path,
release_word_path,
label_data_path,
bow_data_path,