#!/usr/bin/python3
# -*- coding: utf-8 -*-
# **************************
# * Author : evilbear
# * Email : evilbear@live.cn
# * Dependency : word2vec
# * Description : Corresponding preprocessing operations to prepare the files
# *               required for the experiments, such as the 5-fold
# *               cross-validation datasets, vocabulary, and word-vector table.
# * Create time : 23/12/2018
# * File name : corpusPreprocessingFunctions.py
# **************************
import os
import math
import random
import shutil

import word2vec
import numpy as np
from itertools import islice
# Convert the bracket-annotated corpus into BIOES labels.
def corpus2BIO(inFilePath, outFilePath):
    with open(inFilePath, "r", encoding="utf-8") as data_file, \
         open(outFilePath, "w", encoding="utf-8") as save_file:
        # 'GPE' is renamed to 'LOC'; 'OGR' is the organization tag as it
        # appears in the corpus.
        tag_dict = {'GPE': 'LOC', 'OGR': 'ORG', 'PER': 'PER'}
        for line in data_file:
            line = line.strip().split(' ')
            tag_list = ['O'] * len(line)
            state = False
            for idx, word in enumerate(line):
                # An entity starts with '[' followed by its tag, e.g. "[ PER".
                if word in tag_dict and idx > 0 and line[idx-1] == '[':
                    # Mark '[' and the tag word for removal from the output.
                    tag_list[idx-1], tag_list[idx] = '', ''
                    state = True
                    entity_structure = [tag_dict[word]]
                    continue
                if state:
                    if word != ']':
                        entity_structure.append(idx)
                    else:
                        tag_list[idx] = ''  # drop ']'
                        state = False
                        entity = entity_structure[0]
                        entity_left = entity_structure[1]
                        entity_right = entity_structure[-1]
                        entity_len = len(entity_structure[1:])
                        if entity_len == 1:
                            tag_list[entity_left] = 'S-' + entity
                        else:
                            tag_list[entity_left:entity_right+1] = ['I-' + entity] * entity_len
                            tag_list[entity_left] = 'B-' + entity
                            tag_list[entity_right] = 'E-' + entity
            for word, tag in zip(line, tag_list):
                if tag:
                    save_file.write(word + " " + tag + '\n')
            save_file.write('\n')
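# Illustration (hypothetical tokens; the bracketed annotation format is
# inferred from the parsing logic above):
#   input line : "w1 [ PER w2 w3 ] w4"
#   output     : "w1 O" / "w2 B-PER" / "w3 E-PER" / "w4 O", then a blank line.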
# Convert the labeled corpus into an unlabeled corpus.
def label2unlabel(inFilePath, outFilePath):
    with open(inFilePath, "r", encoding="utf-8") as data_file, \
         open(outFilePath, "w", encoding="utf-8") as save_file:
        for line in data_file:
            line = line.strip('\n')
            if len(line) != 0:
                # Keep only the word, dropping the tag column.
                word = line.split(' ')[0]
                save_file.write(word + ' ')
            else:
                # Blank line: end of sentence.
                save_file.write('\n')
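# Illustration: the labeled block "w1 B-PER" / "w2 E-PER" / (blank line)
# becomes the single unlabeled line "w1 w2 " (note the trailing space).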
# Count word frequencies (pure digit tokens are skipped).
def statisticsWord(inFilePath, outFilePath):
    with open(inFilePath, "r", encoding="utf-8") as data_file, \
         open(outFilePath, "w", encoding="utf-8") as save_file:
        words_dict = dict()
        for line in data_file:
            for word in line.strip().split(' '):
                if word.isdigit():
                    continue
                words_dict[word] = words_dict.get(word, 0) + 1
        # Write "word count" lines, sorted by descending frequency.
        data = sorted(words_dict.items(), key=lambda x: x[1], reverse=True)
        for word, count in data:
            save_file.write(word + ' ' + str(count) + '\n')
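# The output is frequency-sorted, which preparationVocabulary below relies on
# when it stops scanning at the first word below the threshold.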
# Merge two unlabeled corpora, deduplicate, and shuffle the sentences.
def mergingUnlabel(FilePath1, FilePath2, outFilePath):
    sentence_set = set()
    with open(FilePath1, 'r', encoding='utf-8') as fr:
        for line in fr:
            sentence_set.add(line.strip())
    with open(FilePath2, 'r', encoding='utf-8') as fr:
        for line in fr:
            sentence_set.add(line.strip())
    sentence_list = list(sentence_set)
    random.shuffle(sentence_list)
    with open(outFilePath, 'w', encoding='utf-8') as fw:
        for line in sentence_list:
            fw.write(line + '\n')
# Report how two frequency files overlap above the given frequency thresholds.
def statisticsWordState(FilePath1, FilePath2, frequency1, frequency2):
    words1, words2 = set(), set()
    with open(FilePath1, "r", encoding="utf-8") as fr:
        for line in fr:
            line = line.strip('\n').split(' ')
            if int(line[1]) >= frequency1:
                words1.add(line[0])
    with open(FilePath2, "r", encoding="utf-8") as fr:
        for line in fr:
            line = line.strip('\n').split(' ')
            try:
                if int(line[1]) >= frequency2:
                    words2.add(line[0])
            except (IndexError, ValueError):
                # Skip malformed lines without a numeric frequency field.
                pass
    print("Words in former, not in latter. The number is {}".format(len(words1 - words2)))
    print("Words in latter, not in former. The number is {}".format(len(words2 - words1)))
    words1.update(words2)
    print("Words in former or latter (union). The number is {}".format(len(words1)))
# Prepare the vocabulary from a frequency file.
def preparationVocabulary(inFilePath, outFilePath, frequency):
    with open(outFilePath, "w", encoding="utf-8") as fw, \
         open(inFilePath, "r", encoding="utf-8") as fr:
        for line in fr:
            line = line.strip('\n').split(' ')
            if int(line[1]) >= frequency:
                fw.write(line[0] + '\n')
            else:
                # The input is sorted by descending frequency (see
                # statisticsWord), so the first miss ends the scan.
                break
# Prepend extra words to the front of the vocabulary file.
def extendVocab(inFilePath, words_list):
    with open(inFilePath, 'r+', encoding='utf-8') as f:
        content = f.read()
        f.seek(0, 0)
        add_content = ''
        for words in words_list:
            add_content += words + '\n'
        f.write(add_content + content)
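# A typical call (an assumed usage pattern, not stated in the source): give
# special tokens the lowest vocabulary indices, e.g.
#   extendVocab('vocab.txt', ['<PAD>', '<UNK>'])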
# Divide the corpus into cross-validation datasets.
def preCrossValidation(inFilePath, outFilePath, batch):
    corpus_list = list()
    with open(inFilePath, "r", encoding="utf-8") as fr:
        sentence = ''
        for line in fr:
            line = line.strip('\n')
            if len(line) != 0:
                sentence += line + '\n'
            else:
                # Discard abnormally long sentences (300 lines or more).
                if len(sentence.split('\n')) < 300:
                    corpus_list.append(sentence + '\n')
                sentence = ''
    random.shuffle(corpus_list)
    batch_size = math.ceil(len(corpus_list) / batch)
    for idx in range(batch):
        file_name = 'part' + str(idx) + '.txt'
        save_path = os.path.join(outFilePath, file_name)
        pointer = idx * batch_size
        with open(save_path, 'w', encoding='utf-8') as fw:
            for line in corpus_list[pointer: pointer + batch_size]:
                fw.write(line)
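# With batch=5 this writes part0.txt ... part4.txt, the 5-fold
# cross-validation split mentioned in the file header.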
# Delete the extra trailing '\n' at the end of a file (or of every file in a directory).
def delLastLine(inFilePath):
    if os.path.isfile(inFilePath):
        with open(inFilePath, 'rb+') as f:
            f.seek(-1, os.SEEK_END)
            f.truncate()
    elif os.path.isdir(inFilePath):
        for file_name in os.listdir(inFilePath):
            path = os.path.join(inFilePath, file_name)
            with open(path, 'rb+') as f:
                f.seek(-1, os.SEEK_END)
                f.truncate()
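# Note: truncating a single byte assumes each file ends with exactly one
# UTF-8 '\n'; a file with a '\r\n' ending would keep a stray '\r'.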
# Train vectors with GloVe and word2vec.
def trainVectors(inFilePath, data_path, glove_vectors_path, word2vec_vectors_path, dimension=300, min_count=3):
    # GloVe: the hyperparameters have to be edited in demo.sh itself.
    command = "cd {0}; sh demo.sh".format(inFilePath)
    os.system(command)
    # Clean up GloVe's intermediate files and keep only the text vectors.
    os.remove(os.path.join(inFilePath, 'cooccurrence.bin'))
    os.remove(os.path.join(inFilePath, 'cooccurrence.shuf.bin'))
    os.remove(os.path.join(inFilePath, 'vectors.bin'))
    os.remove(os.path.join(inFilePath, 'vocab.txt'))
    shutil.move(os.path.join(inFilePath, 'vectors.txt'), glove_vectors_path)
    # word2vec
    word2vec.word2vec(data_path, word2vec_vectors_path, size=dimension, verbose=True, binary=0, min_count=min_count)
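# Assumptions here: inFilePath is a GloVe release directory whose demo.sh is
# configured to train on the corpus and emit vectors.txt; dimension and
# min_count only affect the word2vec run and must be mirrored in demo.sh by hand.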
# Prepare the vector table corresponding to the vocabulary.
def prepareNPZ(FilePathVocab, FilePathVectors, outFilePath, dimension, skipLine=None):
    vocab_dict = dict()
    with open(FilePathVocab, 'r', encoding='utf-8') as fr:
        for idx, word in enumerate(fr):
            vocab_dict[word.strip('\n')] = idx
    # Words absent from the vectors file keep a random initialization.
    embeddings = np.random.randn(len(vocab_dict), dimension)
    with open(FilePathVectors, 'r', encoding='utf-8') as fr:
        # skipLine allows skipping a header line in the vectors file.
        for line in islice(fr, skipLine, None):
            line = line.strip('\n').split(' ')
            word = line[0]
            embedding = [float(x) for x in line[1:dimension+1]]
            # The source breaks off here ("if wor..."); the rest of the
            # function is a completion inferred from its name and variables.
            if word in vocab_dict:
                embeddings[vocab_dict[word]] = embedding
    np.savez(outFilePath, embeddings=embeddings)
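
# A minimal end-to-end sketch of the pipeline (all file names below are
# hypothetical; 'unlabel.txt' would come from processing.py as described in
# the project notes):
if __name__ == '__main__':
    corpus2BIO('data.txt', 'label.txt')              # bracketed corpus -> BIOES
    label2unlabel('label.txt', 'sentences.txt')      # strip the tags again
    mergingUnlabel('sentences.txt', 'unlabel.txt', 'merged.txt')
    statisticsWord('merged.txt', 'frequency.txt')
    preparationVocabulary('frequency.txt', 'vocab.txt', frequency=3)
    extendVocab('vocab.txt', ['<PAD>', '<UNK>'])
    os.makedirs('cv', exist_ok=True)
    preCrossValidation('label.txt', 'cv', batch=5)   # part0.txt ... part4.txt
    delLastLine('cv')
    prepareNPZ('vocab.txt', 'vectors.txt', 'embeddings.npz',
               dimension=300, skipLine=1)            # skip word2vec's header line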
Mongolian corpus preprocessing workflow:
Correct entity-boundary errors from the manual annotation: python correctBoundary.py, plus some manual proofreading.
Preprocess the labeled and unlabeled corpora:
python processing.py --inFilePath="allmergetxt-org-GB-Correction.txt" --outFilePath="data.txt" --deduplication=True --menk2unicode=False --split202F=True
Because the Mongolian text was corrected after the manual annotation, cases such as "[ PERᠳᠦ" appear; the regular expression "[\s(GPE|OGR|PER)[^\s]+" matches 85 times, and these were separated by hand.
python processing.py --inFilePath="data_none.txt" --outFilePath="unlabel.txt" --deduplication=True --menk2unicode=False --split202F=True
Corresponding preprocessing operations on the corpus, preparing the files required for the experiments, such as the 5-fold cross-validation datasets, vocabulary, and word-vector table:
python corpusPrepr
Package contents: mgw_data_processing-master.zip (5 files)
mgw_data_processing-master
  processing.py 4KB
  mongolianPretreatment.py 8KB
  correctBoundary.py 2KB
  convert.py 5KB
  corpusPreprocessingFunctions.py 11KB