package code;
import java.awt.Color;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;
import java.util.Map.Entry;
import javafx.application.Application;
import javafx.geometry.Insets;
import javafx.scene.Node;
import javafx.scene.Scene;
import javafx.scene.control.Button;
import javafx.scene.control.Label;
import javafx.scene.control.TextArea;
import javafx.scene.control.TextField;
import javafx.scene.layout.BorderPane;
import javafx.scene.layout.HBox;
import javafx.scene.layout.VBox;
import javafx.stage.Stage;
import com.sun.jna.Library;
import com.sun.jna.Native;
import com.sun.jna.Platform;
/**
 * JavaFX front-end for a 2-gram (bigram) language-model demo: the user enters
 * candidate sentences, each is segmented by the native NLPIR tokenizer (via
 * JNA), scored against counts trained from a pre-segmented corpus file, and
 * sentences whose score clears a fixed threshold are shown as "grammatical".
 */
public class main extends Application {
    // Total number of single tokens counted from the training corpus.
    static int totalTokensCount = 0;
    // Total number of adjacent token pairs (bigrams) counted from the corpus.
    static int totalTwoTokens = 0;
    // Input sentences, one per slot (filled from the TextArea; capacity 1000).
    static String[] ah = new String[1000];
    // b[i][k] = k-th token of the i-th segmented input sentence.
    static String[][] b = new String[1000][1000];
    BorderPane pane = new BorderPane();
    TextArea a = new TextArea();
    // Frequency table: token -> count, and concatenated bigram -> count.
    static Map<String, Integer> wordCountMap = new HashMap<String, Integer>();

    /**
     * JNA binding for the native NLPIR Chinese word-segmentation library.
     * The DLL is loaded from a hard-coded absolute path, so the program only
     * runs on the original author's machine unless the path is adjusted.
     */
    public interface CLibrary extends Library {
        CLibrary Instance = (CLibrary) Native.loadLibrary(
                "C:\\Users\\hongtao\\Desktop\\汉语分词20140928\\sample\\Java\\jnaTest\\jnaTest\\NLPIR", CLibrary.class);

        // Native function declarations (names and order must match the DLL).
        public boolean NLPIR_Init(byte[] sDataPath, int encoding,
                byte[] sLicenceCode);

        public String NLPIR_ParagraphProcess(String sSrc, int bPOSTagged);

        public String NLPIR_GetKeyWords(String sLine, int nMaxKeyLimit, boolean bWeightOut);

        public void NLPIR_Exit();
    }

    /**
     * Re-encodes a string: interprets {@code aidString}'s bytes in
     * {@code ori_encoding} and decodes them as {@code new_encoding}.
     *
     * @return the re-encoded string, or {@code null} if either charset name
     *         is unsupported (the exception is printed, not propagated)
     */
    public static String transString(String aidString, String ori_encoding,
            String new_encoding) {
        try {
            return new String(aidString.getBytes(ori_encoding), new_encoding);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return null;
    }

    public static void main(String[] args) throws Exception {
        Application.launch(args);
    }

    /**
     * Returns the corpus frequency of the bigram formed by {@code t1}
     * followed by {@code t2}. Despite the name, this is a raw count, not a
     * probability; it is 0 when the pair was never seen or before the corpus
     * has been loaded ({@code totalTokensCount == 0}).
     */
    public static int probBetweenTowTokens(String t1, String t2) {
        Integer count = wordCountMap.get(t1 + t2);
        if (totalTokensCount > 0 && count != null) {
            return count;
        }
        return 0;
    }

    /**
     * Returns the corpus frequency of a single token (0 when unseen or before
     * the corpus has been loaded).
     */
    public static int probBetweenTowTokens(String token) throws Exception {
        Integer count = wordCountMap.get(token);
        if (totalTokensCount > 0 && count != null) {
            return count;
        }
        return 0;
    }

    /**
     * Reads a pre-segmented corpus file (space-separated tokens, one line per
     * sentence) and accumulates unigram and bigram counts into
     * {@code wordCountMap}, updating {@code totalTokensCount} and
     * {@code totalTwoTokens}.
     *
     * <p>Fix over the original: the reader is closed via try-with-resources,
     * and a missing file no longer causes a NullPointerException on
     * {@code readLine()} — it is reported and the method returns.
     *
     * @param afterWordSegFile path to the segmented corpus file
     */
    public static void calculateTokenCount(String afterWordSegFile) throws IOException {
        File movieInfoFile = new File(afterWordSegFile);
        try (BufferedReader movieBR = new BufferedReader(new FileReader(movieInfoFile))) {
            String wordsline;
            while ((wordsline = movieBR.readLine()) != null) {
                String[] words = wordsline.trim().split(" ");
                for (int i = 0; i < words.length; i++) {
                    // Unigram count for words[i].
                    int wordCount = wordCountMap.getOrDefault(words[i], 0);
                    wordCountMap.put(words[i], wordCount + 1);
                    totalTokensCount += 1;
                    // Bigram count, keyed by the two tokens concatenated.
                    if (i < words.length - 1) {
                        String bigram = words[i] + words[i + 1];
                        int bigramCount = wordCountMap.getOrDefault(bigram, 0);
                        wordCountMap.put(bigram, bigramCount + 1);
                        totalTwoTokens += 1;
                    }
                }
            }
        } catch (FileNotFoundException e) {
            System.out.println("movie_result.txt file not found");
            e.printStackTrace();
        } catch (IOException e) {
            System.out.println("read movie_result.txt file failed");
            e.printStackTrace();
        }
    }

    /** Builds the window: input controls on the left, results fill in later. */
    public void start(Stage primaryStage) throws Exception {
        pane.setLeft(getHBox());
        // Fixed typo: the original wrote "-fx-boder-color", which JavaFX
        // silently ignores as an unknown property.
        pane.setStyle("-fx-border-color:red;-fx-background-color:lightblue;");
        Scene scene = new Scene(pane, 900, 500);
        primaryStage.setTitle("识别正确语序");
        primaryStage.setScene(scene);
        primaryStage.show();
    }

    /**
     * Builds the input column: a prompt label, the sentence TextArea and the
     * button that triggers segmentation + scoring.
     */
    public VBox getHBox() throws Exception {
        VBox hBox = new VBox(30);
        a.setPrefSize(400, 500);
        hBox.setPadding(new Insets(5, 15, 5, 15));
        Label h = new Label("请输入你要测试的句子:");
        Button OK = new Button("分词并得到正确语句");
        OK.setOnAction(e -> {
            try {
                getVBox();
            } catch (Exception e1) {
                e1.printStackTrace();
            }
        });
        hBox.getChildren().add(h);
        hBox.getChildren().add(a);
        hBox.getChildren().add(OK);
        return hBox;
    }

    /**
     * Segments each input sentence with NLPIR, scores it under the trained
     * 2-gram counts and displays the per-sentence score plus the sentences
     * judged grammatical (score above a fixed threshold) on the right pane.
     *
     * @throws Exception if charset lookup or corpus reading fails
     */
    public void getVBox() throws Exception {
        VBox vbox = new VBox(15);
        vbox.setPadding(new Insets(50, 550, 5, 5));
        Label x = new Label("2-Gram训练结果:");
        vbox.getChildren().add(x);
        String argu = "";
        int i = 0;
        double[] d = new double[100];
        String s = a.getText();
        String t[] = s.split("\n");
        String system_charset = "UTF-8";
        int charset_type = 1;
        // Initialise the native segmenter; on failure only report it — the
        // original flow continues regardless.
        if (!CLibrary.Instance.NLPIR_Init(argu.getBytes(system_charset),
                charset_type, "0".getBytes(system_charset))) {
            System.err.println("初始化失败!");
        }
        System.out.println("界面程序端运行正常!");
        // Copy input sentences; i ends up holding the sentence count.
        for (i = 0; i < t.length; i++) {
            ah[i] = t[i];
        }
        try {
            for (int j = 0; j < i; j++) {
                // bPOSTagged = 0: plain segmentation, tokens separated by spaces.
                String nativeBytes = CLibrary.Instance.NLPIR_ParagraphProcess(ah[j], 0);
                String[] c = nativeBytes.trim().split(" ");
                for (int k = 0; k < c.length; k++) {
                    b[j][k] = c[k];
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        calculateTokenCount("C:\\Users\\hongtao\\wordseg\\afterSeg.txt");
        for (int k = 0; k < i; k++) {
            int j = 0;
            // Start from the log of the first token's relative frequency.
            d[k] = 1.0;
            d[k] = Math.log(d[k] * probBetweenTowTokens(b[k][0]) * 1.0 / totalTokensCount);
            while (b[k][j + 1] != null) {
                if (probBetweenTowTokens(b[k][j], b[k][j + 1]) != 0)
                    // NOTE(review): "/1.0*x" divides by 1.0 and then MULTIPLIES
                    // by x, so this adds log(count(w1w2) * count(w1)). A textbook
                    // bigram model would use log(count(w1w2) / count(w1)), but the
                    // "> 45" acceptance threshold below is calibrated to this
                    // formula — both would have to change together, so it is
                    // deliberately left as-is. TODO confirm intent and recalibrate.
                    d[k] = d[k] + Math.log((1.0 * probBetweenTowTokens(b[k][j], b[k][j + 1])) / 1.0 * probBetweenTowTokens(b[k][j]));
                j++;
            }
        }
        // d[] is zero past the last scored sentence, so 0.0 marks the end.
        int k = 0;
        while (d[k] != 0.0) {
            Label h = new Label(ah[k] + " " + d[k]);
            k++;
            vbox.getChildren().add(h);
        }
        Label g = new Label("\n\n\n正确的语句有:");
        vbox.getChildren().add(g);
        k = 0;
        int flag = 0;
        while (d[k] != 0.0) {
            // Threshold tied to the scoring formula above — see NOTE(review).
            if (d[k] > 45) {
                Label h = new Label(ah[k]);
                vbox.getChildren().add(h);
                flag++;
            }
            k++;
        }
        if (flag == 0) {
            Label h = new Label("输入的都是不正确语序的句子!");
            vbox.getChildren().add(h);
        }
        pane.setRight(vbox);
    }
}
没有合适的资源?快使用搜索试试~ 我知道了~
基于NLPIR分词工具的识别准确语句程序java版(2-Gram)
共88个文件
map:16个
pdat:14个
user:12个
需积分: 10 39 下载量 120 浏览量
2016-03-21
13:40:32
上传
评论
收藏 10.93MB ZIP 举报
温馨提示
自然语言处理(Natural Language Processing),马尔科夫假设,根据给出的语料库(大量语法合法的文本),训练n-gram模型。根据训练出的模型,判断测试集中每个句子是不是语法合法的句子.。根据语料库训练n-gram模型。根据训练出来的n-gram模型对测试集中的每个句子赋予概率值,判断句子是否符合语法。
资源推荐
资源详情
资源评论
收起资源包目录
wordseg.zip (88个子文件)
wordseg
文档.doc 134KB
2.分析正确语句
code
main.java 8KB
1.训练源代码
code
main.java 5KB
NLIPIR 分词工具java版
20131024.err 190B
src
code
main.java 4KB
bin
code
main$CLibrary.class 760B
main.class 2KB
.classpath 353B
20160303.err 291B
.settings
org.eclipse.core.resources.prefs 88B
org.eclipse.jdt.core.prefs 629B
.project 383B
Data
nr.ctx 2KB
CoreDict.unig 467KB
FTU8.wordlist 186KB
CoreDict.pos 1.7MB
GBK.pdat 536KB
GBKC.wordlist 163KB
UTF2GBKA.map 279KB
ICTPOS.map 422B
BIG5.wordlist 155KB
GBKA2UTF.map 279KB
NLPIR.user 3KB
sentiment.ung 86KB
DocExtractor.user 3KB
sentiment.user 3KB
NLPIR.ctx 36KB
BIG2GBK.map 279KB
20160119.err 305B
UTF8.wordlist 186KB
GBKA.wordlist 163KB
location.pdat 407KB
Configure.xml 1KB
FieldDict.pos 30B
GBKA.pdat 538KB
PKU.map 323B
FieldDict.pdat 256KB
sentiment.pdat 834KB
English
English.ung 1.6MB
Irrel2regular.map 955KB
ne.pos 1.22MB
English.wordlist 2.74MB
English.pdat 5.06MB
ne.pdat 1.11MB
English.pos 4.29MB
ne.wordlist 653KB
keyExtract.user 3KB
GBK2GBKC.map 279KB
PKU_First.map 300B
20141225.err 92B
GranDict.pos 1.7MB
summary.user 3KB
charset.type 64KB
deepclassifier.user 3KB
GBK.wordlist 163KB
CoreDict.pdat 1.62MB
BiWord.big 3.36MB
NewWord.lst 5KB
BIG5.pdat 457KB
cluster.user 3KB
20151225.err 137B
FTU8.pdat 534KB
UTF2GBK.map 279KB
20160229.err 368B
JZSearch.user 3KB
20151228.err 92B
NLPIR_First.map 288B
GBK2BIG.map 279KB
location.wordlist 104KB
LJHtmlParser.user 3KB
nr.fsa 3KB
FTU82GBK.map 279KB
GBK2UTF.map 279KB
UserDefinedDict.lst 22B
GranDict.pdat 1.89MB
nr.role 1.68MB
LJRedupRemover.user 3KB
classifier.user 3KB
location.map 78KB
GBKC2GBK.map 279KB
NLPIR_trial.user 3KB
GBKC.pdat 538KB
20160107.err 92B
UserDict.pdat 34KB
GBK2FTU8.map 279KB
UTF8.pdat 544KB
NLPIR.dll 1.6MB
jna-4.0.0.jar 893KB
共 88 条
- 1
资源评论
hongtao_fan
- 粉丝: 7
- 资源: 4
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功