#!/usr/bin/env python2.7
import os
import sys
import string
import on
import on.corpora
import on.corpora.tree
from on import ontonotes
from on.corpora import subcorpus
from on.common import util
def generate_part_list(filename):
    """Map each part of a CoNLL file to its range of sentence ids.

    The file is scanned line by line:

    * a line starting with ``#end`` closes the current part;
    * other lines starting with ``#`` are ignored;
    * blank / whitespace-only lines are ignored;
    * on any other line the third whitespace-separated column is parsed as
      an integer (presumably the word index within its sentence -- confirm
      against the CoNLL format in use); a value of 0 starts a new sentence.

    Sentence ids are counted across the whole file, starting at 0.

    Args:
        filename: path of the CoNLL file to read.

    Returns:
        A list of ``(part_name, first_sentence_id, last_sentence_id)``
        tuples in file order, where ``part_name`` is ``"part000"``,
        ``"part001"``, ...  A part without sentences has
        ``last_sentence_id == first_sentence_id - 1``.

    Raises:
        IOError/OSError: if the file cannot be opened.
        IndexError: if a data line has fewer than three columns.
        ValueError: if the third column is not an integer.
    """
    part_list = []
    start_sentence_id = 0
    sentence_id = -1  # id of the most recently started sentence
    # The context manager guarantees the handle is closed even on a parse error.
    with open(filename) as infile:
        for line in infile:
            if line.startswith("#"):
                if line.startswith("#end"):
                    # Parts are numbered in order of appearance, zero-padded.
                    part_name = "part" + str(len(part_list)).zfill(3)
                    part_list.append(
                        (part_name, start_sentence_id, sentence_id))
                    start_sentence_id = sentence_id + 1
            elif line.strip():
                # strip() also skips lines holding only spaces or "\r",
                # which would otherwise fail on the column lookup below.
                if int(line.split()[2]) == 0:
                    sentence_id += 1
    return part_list
def get_part_index(list, index):
    """Translate a document-level sentence index into a part-relative one.

    Args:
        list: sequence of ``(part_name, first_sentence_id, last_sentence_id)``
            entries, as produced by ``generate_part_list``.  (The parameter
            name shadows the builtin; it is kept for caller compatibility.)
        index: sentence index counted over the whole document.

    Returns:
        ``(part_name, index - first_sentence_id)`` for the first entry whose
        ``last_sentence_id`` is greater than or equal to ``index``.

    Raises:
        IndexError: if ``index`` lies beyond the last sentence of every entry
            (including when ``list`` is empty).
    """
    for part in list:
        # Entries are scanned in order; the first part whose last sentence
        # is not before `index` is taken as the containing part.
        if index <= part[2]:
            return (part[0], index - part[1])
    # A bare string is not a valid exception object, so raise a real one.
    raise IndexError(
        "get_part_index fail. sentence index %r is beyond the last part"
        % (index,))
# ---------------------------------------------------------------------------
# Script body: walk every parse tree of the loaded OntoNotes corpus and write
# one line per candidate mention to the file "mention" in the working
# directory.
# ---------------------------------------------------------------------------
config = util.load_config("./myconfig")
a_ontonotes = ontonotes(config, "files")
data_in = a_ontonotes.config_opt("data_in")

# Tags collected for every language.
_BASE_TAGS = ("NP", "NNP")
# language code -> (CoNLL directory appended to data_in, extra tags)
_LANGUAGE_SETTINGS = {
    "ar": ("arabic/annotations/", ()),
    "ch": ("chinese/annotations/", ("PN",)),
}
# Any other language code is handled as English.
_DEFAULT_SETTINGS = ("english/annotations/", ("PRP", "PRP$"))

i = 0  # number of tree documents seen so far, printed as progress

# The context manager closes the output file even if a document fails.
with open('mention', 'w') as outfile:
    for a_subcorpus in a_ontonotes:
        for a_treebank in a_subcorpus.all_banks('parse'):
            for a_tree_document in a_treebank:
                i = i + 1
                print(str(i))

                lang = a_tree_document.language
                annotation_dir, extra_tags = _LANGUAGE_SETTINGS.get(
                    lang, _DEFAULT_SETTINGS)
                # Built per document so the tag set neither grows from one
                # document to the next nor carries tags across languages.
                POS_need = frozenset(_BASE_TAGS + extra_tags)

                # The part of the document id before "@" names the CoNLL file.
                doc_id = a_tree_document.document_id.split("@")[0]
                conll_path = data_in + annotation_dir + doc_id + ".conll"
                part_list = generate_part_list(conll_path)

                for a_tree in a_tree_document:
                    sentence_index = a_tree.get_sentence_index()
                    part_index = get_part_index(part_list, sentence_index)

                    # Explicit stack; children are pushed reversed so they
                    # are popped left to right.  The root itself is not
                    # tested, only its descendants.
                    subtree_list = []
                    for a_subtree in a_tree.children:
                        subtree_list.append(a_subtree)
                    subtree_list.reverse()

                    while subtree_list:
                        a_subtree = subtree_list.pop()
                        skip_children = False

                        if a_subtree.tag in POS_need:
                            subtree_tag = a_subtree.tag
                            # Single child covering a single leaf: fold the
                            # wanted tags below into this entry ("NP|NNP")
                            # instead of emitting the same span again.
                            # NOTE(review): assumes subtrees() yields the
                            # child itself as well as its descendants --
                            # confirm against on.corpora.tree.
                            if (len(a_subtree.children) == 1
                                    and len(a_subtree.leaves()) == 1):
                                for sub in a_subtree.children[0].subtrees():
                                    if sub.tag in POS_need:
                                        subtree_tag = (
                                            subtree_tag + "|" + sub.tag)
                                        skip_children = True

                            token_index = a_subtree.get_token_index()
                            last_token = token_index + len(a_subtree) - 1
                            # <doc>.<part>.<sentence in part>.<first>-<last>
                            position = (
                                doc_id + '.' + part_index[0] + "."
                                + str(part_index[1]) + '.'
                                + str(token_index) + '-' + str(last_token))
                            strout = (
                                position + "\t"
                                + a_subtree.get_word_string()
                                + " *****" + subtree_tag + "\t" + "\n")
                            outfile.write(strout.encode("utf-8"))

                        if skip_children:
                            continue

                        child_list = []
                        for a_child in a_subtree.children:
                            child_list.append(a_child)
                        child_list.reverse()
                        subtree_list = subtree_list + child_list
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
extractmention.zip (5个子文件)
extractmention
extract(windows)
myconfig 6KB
mention 21.34MB
extract.py 3KB
myconfig 6KB
extract.py 3KB
共 5 条
- 1
资源评论
lujin0808
- 粉丝: 7
- 资源: 18
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功