# -*- coding: utf-8 -*-
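# 1. Remove the special symbols ".", "<", ">" and "#".
# 2. Remove redundant spaces in the middle of the text.
# 3. Strip leading and trailing whitespace.
# 4. Insert a space between Chinese and English text.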
import io
from itertools import islice

import pandas as pd
def is_chinese(uchar):
    """Return True when *uchar* falls in the range U+4E00..U+9FA5.

    The bounds are compared with ordinary string ordering, so the check is
    intended for a single unicode character.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'
def is_number(uchar):
    """Return True when *uchar* is between U+0030 ('0') and U+0039 ('9')."""
    return u'\u0030' <= uchar <= u'\u0039'
def is_english(uchar):
    """Return True when *uchar* is an ASCII letter (A-Z or a-z)."""
    # Upper-case range U+0041..U+005A.
    if u'\u0041' <= uchar <= u'\u005a':
        return True
    # Lower-case range U+0061..U+007A.
    return u'\u0061' <= uchar <= u'\u007a'
def is_erightbracket(uchar):
    """Return True when *uchar* is the ASCII right parenthesis ')'."""
    return uchar == u')'
def is_crightbracket(uchar):
    """Return True when *uchar* is the full-width right parenthesis U+FF09."""
    return uchar == u'\uff09'
# Main text-cleaning routine
def checkText(tstring):
    """Clean *tstring* and space out Chinese/English boundaries.

    Cleaning removes every '<', '>', '.', '#' and space character, then
    strips any remaining leading/trailing whitespace.  Afterwards a single
    space is appended to:

    * a Chinese character that is followed by an English letter or a digit;
    * an English letter that is followed by a Chinese character.

    A digit followed by a Chinese character is left untouched.

    Returns the rebuilt string.
    """
    # Applied in order: '#' first becomes a space, then all spaces are dropped.
    substitutions = (('<', ''), ('>', ''), ('.', ''), ('#', ' '), (' ', ''))
    cleaned = tstring
    for symbol, substitute in substitutions:
        cleaned = cleaned.replace(symbol, substitute)
    cleaned = cleaned.strip()

    pieces = []
    last = len(cleaned) - 1
    for pos, current in enumerate(cleaned):
        # The final character has no successor, so it is never padded.
        if pos < last:
            following = cleaned[pos + 1]
            if is_chinese(current):
                needs_space = is_english(following) or is_number(following)
            else:
                needs_space = is_english(current) and is_chinese(following)
            if needs_space:
                current = current + " "
        pieces.append(current)
    return "".join(pieces)
# s = 'sdjs新手oh'.decode('utf-8') # Example: the string s is decoded so that it matches the unicode handling used below
# r=checkText(s)
# print r
# Skip the header row, then clean the remaining lines of the input one by one.
data = []
# io.open decodes the GBK bytes while reading and behaves the same on
# Python 2 and Python 3, so the Python-2-only ``unicode()`` call is no longer
# needed.  The ``with`` block closes the file even if checkText raises.
with io.open('tra.csv', encoding='gbk') as input_file:
    for line in islice(input_file, 1, None):
        rust = checkText(line)
        # Keep text as unicode; to_csv below performs the single encode step.
        data.append(rust)
        print(rust)

# Save the cleaned values as a one-column ('name') CSV encoded as GBK.
df = pd.DataFrame(data, columns=['name'])
df.to_csv('11.csv', index=False, sep=",", encoding='gbk')