#!/usr/bin/python3
# -*- coding: utf-8 -*-
# **************************
# * Author : evilbear
# * Email : evilbear@live.cn
# * Dependency : word2vec
# * Description : Corresponding preprocessing operations to prepare the files
# *               required for the experiments, such as the 5-fold
# *               cross-validation datasets, vocabulary, and word-vector table.
# * Create time : 23/12/2018
# * File name : corpusPreprocessingFunctions.py
# **************************
import os
import math
import random
import shutil

import word2vec
import numpy as np
from itertools import islice
# Convert the bracket-annotated corpus into BIOES labels.
def corpus2BIO(inFilePath, outFilePath):
    with open(inFilePath, "r", encoding="utf-8") as data_file, \
         open(outFilePath, "w", encoding="utf-8") as save_file:
        # 'GPE' is renamed to 'LOC'; 'OGR' is the organization tag as it
        # appears in the corpus.
        tag_dict = {'GPE': 'LOC', 'OGR': 'ORG', 'PER': 'PER'}
        for line in data_file:
            line = line.strip().split(' ')
            tag_list = ['O'] * len(line)
            state = False
            for idx, word in enumerate(line):
                # An entity starts with '[' followed by its tag, e.g. "[ PER".
                if word in tag_dict and idx > 0 and line[idx-1] == '[':
                    # Mark '[' and the tag word for removal from the output.
                    tag_list[idx-1], tag_list[idx] = '', ''
                    state = True
                    entity_structure = [tag_dict[word]]
                    continue
                if state:
                    if word != ']':
                        entity_structure.append(idx)
                    else:
                        tag_list[idx] = ''  # drop ']'
                        state = False
                        entity = entity_structure[0]
                        entity_left = entity_structure[1]
                        entity_right = entity_structure[-1]
                        entity_len = len(entity_structure[1:])
                        if entity_len == 1:
                            tag_list[entity_left] = 'S-' + entity
                        else:
                            tag_list[entity_left:entity_right+1] = ['I-' + entity] * entity_len
                            tag_list[entity_left] = 'B-' + entity
                            tag_list[entity_right] = 'E-' + entity
            for word, tag in zip(line, tag_list):
                if tag:
                    save_file.write(word + " " + tag + '\n')
            save_file.write('\n')
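# Illustration (hypothetical tokens; the bracketed annotation format is
# inferred from the parsing logic above):
#   input line : "w1 [ PER w2 w3 ] w4"
#   output     : "w1 O" / "w2 B-PER" / "w3 E-PER" / "w4 O", then a blank line.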
# Convert the labeled corpus into an unlabeled corpus.
def label2unlabel(inFilePath, outFilePath):
    with open(inFilePath, "r", encoding="utf-8") as data_file, \
         open(outFilePath, "w", encoding="utf-8") as save_file:
        for line in data_file:
            line = line.strip('\n')
            if len(line) != 0:
                # Keep only the word, dropping the tag column.
                word = line.split(' ')[0]
                save_file.write(word + ' ')
            else:
                # Blank line: end of sentence.
                save_file.write('\n')
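# Illustration: the labeled block "w1 B-PER" / "w2 E-PER" / (blank line)
# becomes the single unlabeled line "w1 w2 " (note the trailing space).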
# Count word frequencies (pure digit tokens are skipped).
def statisticsWord(inFilePath, outFilePath):
    with open(inFilePath, "r", encoding="utf-8") as data_file, \
         open(outFilePath, "w", encoding="utf-8") as save_file:
        words_dict = dict()
        for line in data_file:
            for word in line.strip().split(' '):
                if word.isdigit():
                    continue
                words_dict[word] = words_dict.get(word, 0) + 1
        # Write "word count" lines, sorted by descending frequency.
        data = sorted(words_dict.items(), key=lambda x: x[1], reverse=True)
        for word, count in data:
            save_file.write(word + ' ' + str(count) + '\n')
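# The output is frequency-sorted, which preparationVocabulary below relies on
# when it stops scanning at the first word below the threshold.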
# Merge two unlabeled corpora, deduplicate, and shuffle the sentences.
def mergingUnlabel(FilePath1, FilePath2, outFilePath):
    sentence_set = set()
    with open(FilePath1, 'r', encoding='utf-8') as fr:
        for line in fr:
            sentence_set.add(line.strip())
    with open(FilePath2, 'r', encoding='utf-8') as fr:
        for line in fr:
            sentence_set.add(line.strip())
    sentence_list = list(sentence_set)
    random.shuffle(sentence_list)
    with open(outFilePath, 'w', encoding='utf-8') as fw:
        for line in sentence_list:
            fw.write(line + '\n')
# Report how two frequency files overlap above the given frequency thresholds.
def statisticsWordState(FilePath1, FilePath2, frequency1, frequency2):
    words1, words2 = set(), set()
    with open(FilePath1, "r", encoding="utf-8") as fr:
        for line in fr:
            line = line.strip('\n').split(' ')
            if int(line[1]) >= frequency1:
                words1.add(line[0])
    with open(FilePath2, "r", encoding="utf-8") as fr:
        for line in fr:
            line = line.strip('\n').split(' ')
            try:
                if int(line[1]) >= frequency2:
                    words2.add(line[0])
            except (IndexError, ValueError):
                # Skip malformed lines without a numeric frequency field.
                pass
    print("Words in former, not in latter. The number is {}".format(len(words1 - words2)))
    print("Words in latter, not in former. The number is {}".format(len(words2 - words1)))
    words1.update(words2)
    print("Words in former or latter (union). The number is {}".format(len(words1)))
# Prepare the vocabulary from a frequency file.
def preparationVocabulary(inFilePath, outFilePath, frequency):
    with open(outFilePath, "w", encoding="utf-8") as fw, \
         open(inFilePath, "r", encoding="utf-8") as fr:
        for line in fr:
            line = line.strip('\n').split(' ')
            if int(line[1]) >= frequency:
                fw.write(line[0] + '\n')
            else:
                # The input is sorted by descending frequency (see
                # statisticsWord), so the first miss ends the scan.
                break
# Prepend extra words to the front of the vocabulary file.
def extendVocab(inFilePath, words_list):
    with open(inFilePath, 'r+', encoding='utf-8') as f:
        content = f.read()
        f.seek(0, 0)
        add_content = ''
        for words in words_list:
            add_content += words + '\n'
        f.write(add_content + content)
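# A typical call (an assumed usage pattern, not stated in the source): give
# special tokens the lowest vocabulary indices, e.g.
#   extendVocab('vocab.txt', ['<PAD>', '<UNK>'])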
# Divide the corpus into cross-validation datasets.
def preCrossValidation(inFilePath, outFilePath, batch):
    corpus_list = list()
    with open(inFilePath, "r", encoding="utf-8") as fr:
        sentence = ''
        for line in fr:
            line = line.strip('\n')
            if len(line) != 0:
                sentence += line + '\n'
            else:
                # Discard abnormally long sentences (300 lines or more).
                if len(sentence.split('\n')) < 300:
                    corpus_list.append(sentence + '\n')
                sentence = ''
    random.shuffle(corpus_list)
    batch_size = math.ceil(len(corpus_list) / batch)
    for idx in range(batch):
        file_name = 'part' + str(idx) + '.txt'
        save_path = os.path.join(outFilePath, file_name)
        pointer = idx * batch_size
        with open(save_path, 'w', encoding='utf-8') as fw:
            for line in corpus_list[pointer: pointer + batch_size]:
                fw.write(line)
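# With batch=5 this writes part0.txt ... part4.txt, the 5-fold
# cross-validation split mentioned in the file header.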
# Delete the extra trailing '\n' at the end of a file (or of every file in a directory).
def delLastLine(inFilePath):
    if os.path.isfile(inFilePath):
        with open(inFilePath, 'rb+') as f:
            f.seek(-1, os.SEEK_END)
            f.truncate()
    elif os.path.isdir(inFilePath):
        for file_name in os.listdir(inFilePath):
            path = os.path.join(inFilePath, file_name)
            with open(path, 'rb+') as f:
                f.seek(-1, os.SEEK_END)
                f.truncate()
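# Note: truncating a single byte assumes each file ends with exactly one
# UTF-8 '\n'; a file with a '\r\n' ending would keep a stray '\r'.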
# Train vectors with GloVe and word2vec.
def trainVectors(inFilePath, data_path, glove_vectors_path, word2vec_vectors_path, dimension=300, min_count=3):
    # GloVe: the hyperparameters have to be edited in demo.sh itself.
    command = "cd {0}; sh demo.sh".format(inFilePath)
    os.system(command)
    # Clean up GloVe's intermediate files and keep only the text vectors.
    os.remove(os.path.join(inFilePath, 'cooccurrence.bin'))
    os.remove(os.path.join(inFilePath, 'cooccurrence.shuf.bin'))
    os.remove(os.path.join(inFilePath, 'vectors.bin'))
    os.remove(os.path.join(inFilePath, 'vocab.txt'))
    shutil.move(os.path.join(inFilePath, 'vectors.txt'), glove_vectors_path)
    # word2vec
    word2vec.word2vec(data_path, word2vec_vectors_path, size=dimension, verbose=True, binary=0, min_count=min_count)
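# Assumptions here: inFilePath is a GloVe release directory whose demo.sh is
# configured to train on the corpus and emit vectors.txt; dimension and
# min_count only affect the word2vec run and must be mirrored in demo.sh by hand.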
# Prepare the vector table corresponding to the vocabulary.
def prepareNPZ(FilePathVocab, FilePathVectors, outFilePath, dimension, skipLine=None):
    vocab_dict = dict()
    with open(FilePathVocab, 'r', encoding='utf-8') as fr:
        for idx, word in enumerate(fr):
            vocab_dict[word.strip('\n')] = idx
    # Words absent from the vectors file keep a random initialization.
    embeddings = np.random.randn(len(vocab_dict), dimension)
    with open(FilePathVectors, 'r', encoding='utf-8') as fr:
        # skipLine allows skipping a header line in the vectors file.
        for line in islice(fr, skipLine, None):
            line = line.strip('\n').split(' ')
            word = line[0]
            embedding = [float(x) for x in line[1:dimension+1]]
            # The source breaks off here ("if wor..."); the rest of the
            # function is a completion inferred from its name and variables.
            if word in vocab_dict:
                embeddings[vocab_dict[word]] = embedding
    np.savez(outFilePath, embeddings=embeddings)
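
# A minimal end-to-end sketch of the pipeline (all file names below are
# hypothetical; 'unlabel.txt' would come from processing.py as described in
# the project notes):
if __name__ == '__main__':
    corpus2BIO('data.txt', 'label.txt')              # bracketed corpus -> BIOES
    label2unlabel('label.txt', 'sentences.txt')      # strip the tags again
    mergingUnlabel('sentences.txt', 'unlabel.txt', 'merged.txt')
    statisticsWord('merged.txt', 'frequency.txt')
    preparationVocabulary('frequency.txt', 'vocab.txt', frequency=3)
    extendVocab('vocab.txt', ['<PAD>', '<UNK>'])
    os.makedirs('cv', exist_ok=True)
    preCrossValidation('label.txt', 'cv', batch=5)   # part0.txt ... part4.txt
    delLastLine('cv')
    prepareNPZ('vocab.txt', 'vectors.txt', 'embeddings.npz',
               dimension=300, skipLine=1)            # skip word2vec's header line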
Mongolian corpus preprocessing workflow:
Correct entity-boundary errors from the manual annotation: python correctBoundary.py, plus some manual proofreading.
Preprocess the labeled and unlabeled corpora:
python processing.py --inFilePath="allmergetxt-org-GB-Correction.txt" --outFilePath="data.txt" --deduplication=True --menk2unicode=False --split202F=True
Because the Mongolian text was corrected after the manual annotation, cases such as "[ PERᠳᠦ" appear; the regular expression "[\s(GPE|OGR|PER)[^\s]+" matches 85 times, and these were separated by hand.
python processing.py --inFilePath="data_none.txt" --outFilePath="unlabel.txt" --deduplication=True --menk2unicode=False --split202F=True
Corresponding preprocessing operations on the corpus, preparing the files required for the experiments, such as the 5-fold cross-validation datasets, vocabulary, and word-vector table:
python corpusPrepr
Package contents: mgw_data_processing-master.zip (5 files)
mgw_data_processing-master
  processing.py 4KB
  mongolianPretreatment.py 8KB
  correctBoundary.py 2KB
  convert.py 5KB
  corpusPreprocessingFunctions.py 11KB