import time
import TC.labels as labels
import os
import math
def documentfrequency(prepath=r"../trainTemp/", threshold=107, classLabels=None):
    """Select feature words by per-class document frequency.

    For every class label, the files ``<prepath><label.value>/<i>.txt`` with
    ``i`` in ``range(10, 1900)`` that exist are read as documents. Each
    distinct line of a file (trailing newline removed) is treated as one
    word. A word is selected when it occurs in more than ``threshold``
    documents of at least one class.

    Args:
        prepath: Directory prefix (including the trailing separator) that
            holds one sub-directory per class.
        threshold: A word must appear in strictly more than this many
            documents of a single class to be selected.
        classLabels: Iterable of label objects exposing ``.value`` (the
            sub-directory name). Defaults to ``labels.labels``.

    Returns:
        List of selected words without duplicates, in the order in which
        they were first selected.
    """
    if classLabels is None:
        classLabels = labels.labels

    featureTable = []
    selected = set()  # O(1) duplicate check; featureTable keeps the order
    for label in classLabels:
        wordDocumentFrequency = {}
        for fileIndex in range(10, 1900):
            path = prepath + label.value + "/" + str(fileIndex) + ".txt"
            if not os.path.exists(path):
                continue
            # The context manager closes the file even if reading fails.
            with open(path, encoding="utf-8", errors="ignore") as inputfile:
                # A set, so each word counts at most once per document.
                textSet = {line.strip('\n') for line in inputfile}
            for word in textSet:
                wordDocumentFrequency[word] = wordDocumentFrequency.get(word, 0) + 1
        for word, count in wordDocumentFrequency.items():
            if count > threshold and word not in selected:
                selected.add(word)
                featureTable.append(word)
    return featureTable
def mutualInformation(prepath=r"../trainTemp/", threshold=0.0084, classLabels=None):
    """Select feature words by class/word mutual information.

    For every class label, the files ``<prepath><label.value>/<i>.txt`` with
    ``i`` in ``range(10, 1900)`` that exist are read as documents. Each
    distinct line of a file (trailing newline removed) is treated as one
    word. For each (class, word) pair where the word occurs in the class,
    the mutual information between "document is in the class" and "document
    contains the word" is computed from the 2x2 document-count table, using
    the natural logarithm. A word is selected when that value exceeds
    ``threshold`` for at least one class.

    Args:
        prepath: Directory prefix (including the trailing separator) that
            holds one sub-directory per class.
        threshold: Minimum (exclusive) mutual information for selection.
        classLabels: Iterable of label objects exposing ``.value`` (the
            sub-directory name) and ``.name`` (the class key). Defaults to
            ``labels.labels``.

    Returns:
        List of selected words without duplicates, in the order in which
        they were first selected.
    """
    if classLabels is None:
        classLabels = labels.labels

    documentNumInClass = {}   # class name -> number of documents in the class
    wordFrequentInClass = {}  # class name -> {word: documents of the class containing it}
    wordDocumentNum = {}      # word -> documents (all classes) containing it
    for label in classLabels:
        documentNumInClass[label.name] = 0
        wordFrequent = {}
        for fileIndex in range(10, 1900):
            path = prepath + label.value + "/" + str(fileIndex) + ".txt"
            if not os.path.exists(path):
                continue
            # The context manager closes the file even if reading fails.
            with open(path, encoding="utf-8", errors="ignore") as inputfile:
                # A set, so each word counts at most once per document.
                wordSet = {line.strip('\n') for line in inputfile}
            documentNumInClass[label.name] += 1
            for w in wordSet:
                # The first occurrence counts as 1, so the totals are real
                # document counts.
                wordFrequent[w] = wordFrequent.get(w, 0) + 1
                wordDocumentNum[w] = wordDocumentNum.get(w, 0) + 1
        wordFrequentInClass[label.name] = wordFrequent

    # Compute mutual information.
    # Naming: first index = class membership, second index = word presence.
    N = sum(documentNumInClass.values())
    featureTable = []
    selected = set()  # O(1) duplicate check; featureTable keeps the order
    for className, wordFrequent in wordFrequentInClass.items():
        classDocs = documentNumInClass[className]
        N1dot = max(classDocs, 1)      # documents in the class
        N0dot = max(N - classDocs, 1)  # documents outside the class
        for word, N11 in wordFrequent.items():
            Ndot1 = wordDocumentNum[word]  # documents containing the word
            Ndot0 = N - Ndot1              # documents without the word
            N01 = Ndot1 - N11              # outside the class, with the word
            N10 = classDocs - N11          # in the class, without the word
            N00 = Ndot0 - N10              # outside the class, without the word
            # Clamp every count to at least 1 so no logarithm receives 0.
            # NOTE(review): this gives empty cells a small non-zero
            # contribution instead of the conventional 0*log(0) = 0;
            # confirm this smoothing is intended.
            N11 = max(N11, 1)
            N01 = max(N01, 1)
            N10 = max(N10, 1)
            N00 = max(N00, 1)
            Ndot0 = max(Ndot0, 1)
            Ndot1 = max(Ndot1, 1)
            # Parenthesised so that all four terms belong to one expression;
            # without the parentheses only the first line would be assigned.
            mi = (
                (N11 / N) * math.log(N * N11 / (N1dot * Ndot1))
                + (N01 / N) * math.log(N * N01 / (N0dot * Ndot1))
                + (N10 / N) * math.log(N * N10 / (N1dot * Ndot0))
                + (N00 / N) * math.log(N * N00 / (Ndot0 * N0dot))
            )
            if mi > threshold and word not in selected:
                selected.add(word)
                featureTable.append(word)
    return featureTable
def informationGain(prepath=r"../trainTemp/", threshold=0.0083, classLabels=None):
    """Select feature words by information gain over the class distribution.

    For every class label, the files ``<prepath><label.value>/<i>.txt`` with
    ``i`` in ``range(10, 1900)`` that exist are read as documents. Each
    distinct line of a file (trailing newline removed) is treated as one
    word. For each word the information gain is
    ``H(C) - P(t) * H(C | t) - P(not t) * H(C | not t)``, computed with the
    natural logarithm, where ``t`` means "the document contains the word".
    A word is selected when its gain exceeds ``threshold``.

    Args:
        prepath: Directory prefix (including the trailing separator) that
            holds one sub-directory per class.
        threshold: Minimum (exclusive) information gain for selection.
        classLabels: Iterable of label objects exposing ``.value`` (the
            sub-directory name) and ``.name`` (the class key). Defaults to
            ``labels.labels``.

    Returns:
        List of selected words, each appearing once. Empty when no document
        was found.
    """
    if classLabels is None:
        classLabels = labels.labels

    documentNumInClass = {}   # class name -> number of documents in the class
    wordFrequentInClass = {}  # class name -> {word: documents of the class containing it}
    wordDocumentNum = {}      # word -> documents (all classes) containing it
    for label in classLabels:
        documentNumInClass[label.name] = 0
        tempWordFrequent = {}
        for fileIndex in range(10, 1900):
            path = prepath + label.value + "/" + str(fileIndex) + ".txt"
            if not os.path.exists(path):
                continue
            documentNumInClass[label.name] += 1
            # The context manager closes the file even if reading fails.
            with open(path, encoding="utf-8", errors="ignore") as inputfile:
                # A set, so each word counts at most once per document.
                wordSet = {line.strip('\n') for line in inputfile}
            for s in wordSet:
                tempWordFrequent[s] = tempWordFrequent.get(s, 0) + 1
                wordDocumentNum[s] = wordDocumentNum.get(s, 0) + 1
        wordFrequentInClass[label.name] = tempWordFrequent

    N = sum(documentNumInClass.values())
    if N == 0:
        # No documents at all: nothing to rank, and every ratio below would
        # divide by zero.
        return []

    # Entropy of the class distribution before observing any word.
    beginEntropy = 0
    for classDocs in documentNumInClass.values():
        P = classDocs / N
        if P > 0:  # an empty class contributes 0 (and log(0) is undefined)
            beginEntropy -= P * math.log(P)

    featureTable = []
    for word, v in wordDocumentNum.items():
        P_t = v / N
        P_not_t = 1 - P_t
        # Both accumulators hold sum(p * log(p)), i.e. the negated entropy.
        endEntropy_t = 0
        endEntropy_not_t = 0
        for labelName, wordFrequent in wordFrequentInClass.items():
            c_and_t = wordFrequent.get(word, 0)
            c_and_not_t = documentNumInClass[labelName] - c_and_t
            if c_and_t > 0:
                P_c_In_t = c_and_t / v
                endEntropy_t += P_c_In_t * math.log(P_c_In_t)
            # c_and_not_t > 0 implies at least one document lacks the word,
            # so N - v is non-zero here; a word present in every document
            # never reaches this division.
            if c_and_not_t > 0:
                P_c_In_not_t = c_and_not_t / (N - v)
                endEntropy_not_t += P_c_In_not_t * math.log(P_c_In_not_t)
        IG = endEntropy_t * P_t + endEntropy_not_t * P_not_t + beginEntropy
        if IG > threshold:
            featureTable.append(word)
    return featureTable
# print(len(informationGain()))
# print(len(mutualInformation()))
# print(len((documentfrequency())))
#documentfrequency()
# start=time.time()
# t=mutualInformation()
# print("length:"+str(len(t)))
# end=time.time()
# print(end-start)