conll特征词提取python资源-CSDN文库

共5个文件

py：2个

myconfig：2个

mention：1个

conll

python

需积分: 50 187 浏览量 2012-02-20 09:10:13 上传评论 1 收藏 3.18MB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

extractmention.zip （5个子文件）

extractmention

extract(windows)

myconfig 6KB

mention 21.34MB

extract.py 3KB

myconfig 6KB

extract.py 3KB

#!/usr/bin/env python2.7
"""Write candidate mention spans from parse trees to the file ``mention``.

For every tree document reachable through the ``on`` package, the matching
``.conll`` file is scanned to learn which sentences belong to which document
part. Each subtree whose tag is a wanted mention tag is then written as one
line of the form::

    <doc_id>.<part>.<sentence-in-part>.<first>-<last>\t<words> *****<tags>\t

Targets Python 2.7 (see the shebang); the helpers are also valid Python 3.
"""

import os
import sys
import string

# Tags that mark a mention candidate regardless of language.
_BASE_MENTION_TAGS = ("NP", "NNP")


def generate_part_list(filename):
    """Return the sentence range of every part in a CoNLL file.

    Sentences are numbered from 0 across the whole file; a new sentence is
    counted each time the third whitespace-separated column of a data line
    is 0. Every line starting with ``#end`` closes one part.

    Args:
        filename: Path of the ``.conll`` file to scan.

    Returns:
        A list of ``(part_name, first_sentence_id, last_sentence_id)``
        tuples in file order, where ``part_name`` is ``"part000"``,
        ``"part001"``, ... and both sentence ids are inclusive.

    Raises:
        IOError/OSError: if the file cannot be opened.
        IndexError, ValueError: if a data line has fewer than three columns
            or a non-integer third column.
    """
    part_list = []
    part_id = -1
    start_sentence_id = 0
    sentence_id = -1
    # The context manager closes the handle even if a line fails to parse.
    with open(filename) as infile:
        for line in infile:
            if line.startswith("#"):
                if line.startswith("#end"):
                    part_id += 1
                    str_part_id = "part" + str(part_id).zfill(3)
                    part_list.append(
                        (str_part_id, start_sentence_id, sentence_id))
                    start_sentence_id = sentence_id + 1
                # Any other '#' line (e.g. a '#begin' header) carries no
                # sentence information.
                continue
            if not line.strip():
                # Sentence separator. Checking the stripped line also covers
                # "\r\n" and whitespace-only lines, which would otherwise
                # fail on the column lookup below.
                continue
            word_index = int(line.split()[2])
            if word_index == 0:
                sentence_id += 1
    return part_list


def get_part_index(list, index):
    """Map a document-wide sentence id to its part and in-part sentence id.

    Args:
        list: Part table as returned by ``generate_part_list``. (The name
            shadows the builtin; it is kept so existing callers still work.)
        index: Sentence id counted from the start of the document.

    Returns:
        ``(part_name, sentence_id_within_part)`` for the first part whose
        last sentence id is greater than or equal to ``index``.

    Raises:
        ValueError: if ``index`` lies beyond the last part.
    """
    for entry in list:
        if index <= entry[2]:
            return (entry[0], index - entry[1])
    # A string cannot be raised on Python 2.6+; use a real exception type.
    raise ValueError("get_part_index fail.")


def _annotation_dir(lang):
    """Return the annotation sub-directory used for language code *lang*."""
    if lang == "ar":
        return "arabic/annotations/"
    if lang == "ch":
        return "chinese/annotations/"
    # Every other language code is treated as English.
    return "english/annotations/"


def _mention_tags(lang):
    """Return the set of tags treated as mentions for language code *lang*."""
    tags = set(_BASE_MENTION_TAGS)
    if lang == "ch":
        tags.add("PN")
    elif lang != "ar":
        tags.update(("PRP", "PRP$"))
    return frozenset(tags)


def _write_document_mentions(a_tree_document, data_in, outfile):
    """Write one output line per mention subtree of *a_tree_document*.

    ``data_in`` is concatenated directly with the language directory, so it
    must already end with a path separator.
    """
    lang = a_tree_document.language
    doc_id = a_tree_document.document_id.split("@")[0]
    conll_path = data_in + _annotation_dir(lang) + doc_id + ".conll"
    part_list = generate_part_list(conll_path)
    # Built per document so tags of one language never leak into another.
    wanted_tags = _mention_tags(lang)

    for a_tree in a_tree_document:
        part_index = get_part_index(part_list, a_tree.get_sentence_index())
        prefix = doc_id + '.' + part_index[0] + "." + str(part_index[1]) + '.'

        # Explicit stack; children are pushed reversed so that they are
        # popped, and therefore written, in left-to-right order.
        pending = [child for child in a_tree.children]
        pending.reverse()
        while pending:
            a_subtree = pending.pop()
            if a_subtree.tag in wanted_tags:
                subtree_tag = a_subtree.tag
                merged = False
                if (len(a_subtree.children) == 1
                        and len(a_subtree.leaves()) == 1):
                    # Single-child chain over one leaf: fold the matching
                    # tags below into this line instead of emitting the
                    # same span again for each of them.
                    for sub in a_subtree.children[0].subtrees():
                        if sub.tag in wanted_tags:
                            subtree_tag = subtree_tag + "|" + sub.tag
                            merged = True
                first_token = a_subtree.get_token_index()
                last_token = first_token + len(a_subtree) - 1
                position = prefix + str(first_token) + '-' + str(last_token)
                strout = (position + "\t" + a_subtree.get_word_string()
                          + " *****" + subtree_tag + "\t" + "\n")
                outfile.write(strout.encode("utf-8"))
                if merged:
                    continue
            children = [child for child in a_subtree.children]
            children.reverse()
            pending.extend(children)


def main():
    """Load ``./myconfig`` and write all mentions to the file ``mention``."""
    # Imported here rather than at module level so the helpers above stay
    # importable where the third-party `on` package is not installed.
    import on
    import on.corpora
    import on.corpora.tree
    from on import ontonotes
    from on.corpora import subcorpus
    from on.common import util

    config = util.load_config("./myconfig")
    a_ontonotes = ontonotes(config, "files")
    data_in = a_ontonotes.config_opt("data_in")

    doc_count = 0
    # The context manager closes the output even if a document fails.
    with open('mention', 'w') as outfile:
        for a_subcorpus in a_ontonotes:
            for a_treebank in a_subcorpus.all_banks('parse'):
                for a_tree_document in a_treebank:
                    doc_count += 1
                    # Progress indicator: running document number.
                    print(str(doc_count))
                    _write_document_mentions(
                        a_tree_document, data_in, outfile)


if __name__ == "__main__":
    main()

评论收藏

内容反馈