#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#define SUM 40000 // upper limit on the total number of feature terms
#define MAX 10000 // upper limit on the number of terms in one document (not referenced in the code visible here)
#define TEXT 560 // total number of documents in one category
// One (term, weight) pair; arrays of this struct form the weighted feature vector.
typedef struct
{
char word[50]; // term text, kept as a NUL-terminated string
double v; // weight assigned to the term
}WORD;
WORD vwords[SUM]; // all (term, weight) pairs, one slot per feature term
WORD topwords[5000]; // meant for the top 5000 feature terms; not referenced in the code visible here
// Computes the feature weight of one term and returns it.
//
//   tf  - total number of occurrences of the term
//   df  - number of documents that contain the term
//   max - occurrence count used to normalise tf
//
// The result is (tf / max) * log(TEXT / df + 1), using the natural logarithm.
// Adding 1 keeps the argument of log() above 1, so the idf factor is positive.
//
// Returns 0.0 when tf, df or max is not positive: those inputs would divide
// by zero or take the logarithm of a non-positive value, which produces
// inf/NaN weights that cannot be ordered meaningfully afterwards.
double calculate(int tf,int df,int max)
{
    double term_freq;
    double idf;

    if (tf <= 0 || df <= 0 || max <= 0)
        return 0.0;

    term_freq = (double)tf / (double)max;           // normalised term frequency
    idf = log((double)TEXT / (double)df + 1.0);     // inverse document frequency

    return term_freq * idf;
}
// Sorts vwords[0 .. count-1] in place by weight, largest weight first.
// Selection sort: each pass picks the heaviest entry among those not yet
// placed and swaps it into the current slot.
void change(WORD vwords[],int count) // count is the total number of terms
{
    int slot = 0;

    while (slot < count - 1) {
        int best = slot;
        int probe;

        // Find the entry with the largest weight in vwords[slot .. count-1].
        for (probe = slot + 1; probe < count; ++probe) {
            if (vwords[probe].v > vwords[best].v)
                best = probe;
        }

        // Exchange term text and weight together.
        if (best != slot) {
            WORD held = vwords[slot];
            vwords[slot] = vwords[best];
            vwords[best] = held;
        }

        ++slot;
    }
}
void main()
{
// WORD topwords[5000];
FILE *fp,*out;
char temp[50],words[SUM][50],ct; //temp读取提取特征中的一个词,words保存总的特征词条
int i,j,k,m,df,tf,count; //count为特征词条总数
count=0;df=0;tf=0;i=0;j=0;k=0;
char h[]="G:\\数据集\\去停用词后数据集\\艺术\\C"; //某类别总文本的存放位置
char r1[]="G:\\数据集\\提取特征向量集\\艺术.txt"; //提取的总特征存放位置
char r2[100];
char wname[]="G:\\数据集\\权重向量集\\艺术.txt"; //输出所有(特征,权重)对的位置
char top5000[]="G:\\数据集\\前5000个特征权重\\艺术.txt"; //输出前5000个(特征,权重)对的位置
if((fp=fopen(r1,"r"))==NULL)
{
printf("Open file error!");
exit(0);
}
ct=fgetc(fp);
while(ct!=EOF) //保存总特征词条到words数组
{
if(ct=='\n')
{
words[i][j]='\0';
j=0;
i++;
count++;
}
else
{
words[i][j]=ct;
j++;
}
ct=fgetc(fp);
}
words[i][j]='\0';
fclose(fp);
printf("总词条数为:%d\n",count);
i=0;j=0;
double vector; //暂存权重值
int max=0; //出现次数最多的词
int flag=0;
for(i=0;i<count;i++)
{
df=0;tf=0;
for(m=1;m<=TEXT;m++)
{
flag=0;
sprintf(r2,"%s%d.txt",h,m);
if((fp=fopen(r2,"r"))==NULL)
{
printf("Open file error!\n");
exit(0);
}
ct=fgetc(fp);
while(ct!=EOF)
{
if(ct=='\n')
{
temp[j]='\0';
j=0;
if(strcmp(temp,words[i])==0)
{
flag=1;
tf++;
}
}
else
{
temp[j]=ct;
j++;
}
ct=fgetc(fp);
}
fclose(fp);
if(flag>0)
df++;
if(tf>max)
max=tf;
}
vector=calculate(tf,df,max); //计算权重,返回结果用vector保存
strcpy(vwords[i].word,words[i]);
vwords[i].v=vector;
printf("SUCCEED %d\n",i+1);
}
change(vwords,count); //将特征词条按权重排序
if((out=fopen(wname,"w"))==NULL)
{
printf("Open file error!\n");
exit(0);
}
for(i=0;i<count;i++) //保存所有特征词条
fprintf(out,"%d %s %lf\n",i+1,vwords[i].word,vwords[i].v);
fclose(out);
if((out=fopen(top5000,"w"))==NULL)
{
printf("Open file error!\n");
exit(0);
}
for(i=0;i<5000;i++) //保存前5000个特征词条
fprintf(out,"%s %lf\n",vwords[i].word,vwords[i].v);
fclose(out);
}