# -*- coding:utf-8 -*-
#author TCY
#email tangchenyi@mail.hfut.edu.cn
#date 2015/04/03
import os
import math
import sys
class FeatureSelector:
#supported algorithms
TYPE_CHI="CHI"
TYPE_MI="MI"
TYPE_IG="IG"
Type=(TYPE_CHI, TYPE_MI, TYPE_IG)
def __init__(self, selectorType, srcDataPath, dstDicPath, numOfKind):
if(selectorType not in self.Type):
raise ValueError,"selectorType error"
self.__selectorType = selectorType
self.__srcDataPath = srcDataPath
self.__dstDicPath = dstDicPath
self.__numOfKind = numOfKind
pass
def select(self):
if(self.__selectorType == self.TYPE_CHI):
self.run_CHI()
elif(self.__selectorType == self.TYPE_MI):
self.run_MI()
elif(self.__selectorType == self.TYPE_IG):
self.run_IG()
pass
def run_CHI(self):
self.count()
self.calculate_CHI()
return self.__terms_value
pass
def run_MI(self):
self.count()
self.calculate_MI()
return self.__terms_value
pass
def run_IG(self):
self.count()
self.calculate_IG()
return self.__terms_value
pass
def count(self):
fp = open(self.__srcDataPath, "r")
terms_kind=dict()
numOfLine = 0;
numOfEachKind = [None]*self.__numOfKind
for i in range(self.__numOfKind):
numOfEachKind[i] = 0
for line in fp:
terms = line.split()
label=int(terms[0])
unique_terms=set()
for term in terms[1:]:
unique_terms.add(term)
for term in unique_terms:
if term not in terms_kind:
kind=[None]*self.__numOfKind
for i in range(self.__numOfKind):
kind[i] = 0
kind[label]=1
terms_kind[term]=kind
else:
kind=terms_kind[term]
kind[label]=kind[label]+1
numOfLine+=1
numOfEachKind[label]+=1
self.__numOfLine = numOfLine
self.__terms_kind=terms_kind
self.__numOfEachKind = numOfEachKind
pass
def calculate_CHI(self):
N = self.__numOfLine
terms_CHI = dict()
for term in self.__terms_kind.items():
kind = term[1]
numOfThisTerm = 0
CHIs = []
for i in range(self.__numOfKind):
numOfThisTerm += kind[i]
for i in range(self.__numOfKind):
A = kind[i]
B = numOfThisTerm - A
C = self.__numOfEachKind[i] - A
#D=N-A-B-C
D = N - numOfThisTerm - C
#CHI=N*(A*D-B*C)^2/((A+C)*(A+B)*(B+D)*(C+D))
CHI = N * math.pow((A*D-B*C),2)/((A+C)*(A+B)*(B+D)*(C+D))
CHIs.append(CHI)
CHIs.sort()
terms_CHI[term[0]]=CHIs[self.__numOfKind-1]
self.__terms_value = terms_CHI
def calculate_IG(self):
N = self.__numOfLine+0.0
terms_IG = dict()
C = [None]*self.__numOfKind
PC = [None]*self.__numOfKind
tmp_PC = 0;
for i in range(self.__numOfKind):
C[i] = self.__numOfEachKind[i]
PC[i] = C[i] / N
tmp_PC += PC[i] * math.log(PC[i], 2)
for term in self.__terms_kind.items():
kind = term[1]
numOfThisTerm = 0
for i in range(self.__numOfKind):
numOfThisTerm += kind[i]
A = numOfThisTerm
PA = A / N
PA_ = 1 - PA
tmp_PB = 0
tmp_PB_ = 0
for i in range(self.__numOfKind):
B = kind[i]
B_ = C[i] - B
PB = B / A
PB_ = B_ / (N - A)
if PB != 0:
tmp_PB += PB*math.log(PB,2)
if PB_ != 0:
tmp_PB_ += PB_*math.log(PB_,2)
IG = PA * tmp_PB + PA_ * tmp_PB_ - tmp_PC
terms_IG[term[0]]=IG
self.__terms_value = terms_IG
pass
def calculate_MI(self):
N = self.__numOfLine
terms_MI = dict()
for term in self.__terms_kind.items():
kind = term[1]
numOfThisTerm = 0
MIs = []
for i in range(self.__numOfKind):
numOfThisTerm += kind[i]
for i in range(self.__numOfKind):
A = kind[i]
B = numOfThisTerm - A
C = self.__numOfEachKind[i] - A
#MI=lg(A*N/((A+C)(A+B))
if A != 0:
MI = math.log(N*A/((A+C)*(A+B)))
else :
MI = 0
MIs.append(MI)
MIs.sort()
terms_MI[term[0]]=MIs[self.__numOfKind-1]
self.__terms_value = terms_MI
pass
def printMessage(self):
fp = open(self.__dstDicPath, "w")
for item in self.__terms_value.items():
fp.write("%s\t%f\n"%item)
fp.close()
if __name__ == "__main__":
selector= FeatureSelector("IG", "test.txt", "dic.txt", 3)
selector.select()
selector.printMessage()