package code;
import java.awt.Color;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;
import java.util.Map.Entry;
import javafx.application.Application;
import javafx.geometry.Insets;
import javafx.scene.Node;
import javafx.scene.Scene;
import javafx.scene.control.Button;
import javafx.scene.control.Label;
import javafx.scene.control.TextArea;
import javafx.scene.control.TextField;
import javafx.scene.layout.BorderPane;
import javafx.scene.layout.HBox;
import javafx.scene.layout.VBox;
import javafx.stage.Stage;
import com.sun.jna.Library;
import com.sun.jna.Native;
import com.sun.jna.Platform;
/**
 * JavaFX front-end for a 2-gram (bigram) language-model demo: the user enters
 * candidate sentences, each is segmented by the native NLPIR tokenizer (via
 * JNA), scored against counts trained from a pre-segmented corpus file, and
 * sentences whose score clears a fixed threshold are shown as "grammatical".
 */
public class main extends Application {
    // Total number of single tokens counted from the training corpus.
    static int totalTokensCount = 0;
    // Total number of adjacent token pairs (bigrams) counted from the corpus.
    static int totalTwoTokens = 0;
    // Input sentences, one per slot (filled from the TextArea; capacity 1000).
    static String[] ah = new String[1000];
    // b[i][k] = k-th token of the i-th segmented input sentence.
    static String[][] b = new String[1000][1000];
    BorderPane pane = new BorderPane();
    TextArea a = new TextArea();
    // Frequency table: token -> count, and concatenated bigram -> count.
    static Map<String, Integer> wordCountMap = new HashMap<String, Integer>();

    /**
     * JNA binding for the native NLPIR Chinese word-segmentation library.
     * The DLL is loaded from a hard-coded absolute path, so the program only
     * runs on the original author's machine unless the path is adjusted.
     */
    public interface CLibrary extends Library {
        CLibrary Instance = (CLibrary) Native.loadLibrary(
                "C:\\Users\\hongtao\\Desktop\\汉语分词20140928\\sample\\Java\\jnaTest\\jnaTest\\NLPIR", CLibrary.class);

        // Native function declarations (names and order must match the DLL).
        public boolean NLPIR_Init(byte[] sDataPath, int encoding,
                byte[] sLicenceCode);

        public String NLPIR_ParagraphProcess(String sSrc, int bPOSTagged);

        public String NLPIR_GetKeyWords(String sLine, int nMaxKeyLimit, boolean bWeightOut);

        public void NLPIR_Exit();
    }

    /**
     * Re-encodes a string: interprets {@code aidString}'s bytes in
     * {@code ori_encoding} and decodes them as {@code new_encoding}.
     *
     * @return the re-encoded string, or {@code null} if either charset name
     *         is unsupported (the exception is printed, not propagated)
     */
    public static String transString(String aidString, String ori_encoding,
            String new_encoding) {
        try {
            return new String(aidString.getBytes(ori_encoding), new_encoding);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return null;
    }

    public static void main(String[] args) throws Exception {
        Application.launch(args);
    }

    /**
     * Returns the corpus frequency of the bigram formed by {@code t1}
     * followed by {@code t2}. Despite the name, this is a raw count, not a
     * probability; it is 0 when the pair was never seen or before the corpus
     * has been loaded ({@code totalTokensCount == 0}).
     */
    public static int probBetweenTowTokens(String t1, String t2) {
        Integer count = wordCountMap.get(t1 + t2);
        if (totalTokensCount > 0 && count != null) {
            return count;
        }
        return 0;
    }

    /**
     * Returns the corpus frequency of a single token (0 when unseen or before
     * the corpus has been loaded).
     */
    public static int probBetweenTowTokens(String token) throws Exception {
        Integer count = wordCountMap.get(token);
        if (totalTokensCount > 0 && count != null) {
            return count;
        }
        return 0;
    }

    /**
     * Reads a pre-segmented corpus file (space-separated tokens, one line per
     * sentence) and accumulates unigram and bigram counts into
     * {@code wordCountMap}, updating {@code totalTokensCount} and
     * {@code totalTwoTokens}.
     *
     * <p>Fix over the original: the reader is closed via try-with-resources,
     * and a missing file no longer causes a NullPointerException on
     * {@code readLine()} — it is reported and the method returns.
     *
     * @param afterWordSegFile path to the segmented corpus file
     */
    public static void calculateTokenCount(String afterWordSegFile) throws IOException {
        File movieInfoFile = new File(afterWordSegFile);
        try (BufferedReader movieBR = new BufferedReader(new FileReader(movieInfoFile))) {
            String wordsline;
            while ((wordsline = movieBR.readLine()) != null) {
                String[] words = wordsline.trim().split(" ");
                for (int i = 0; i < words.length; i++) {
                    // Unigram count for words[i].
                    int wordCount = wordCountMap.getOrDefault(words[i], 0);
                    wordCountMap.put(words[i], wordCount + 1);
                    totalTokensCount += 1;
                    // Bigram count, keyed by the two tokens concatenated.
                    if (i < words.length - 1) {
                        String bigram = words[i] + words[i + 1];
                        int bigramCount = wordCountMap.getOrDefault(bigram, 0);
                        wordCountMap.put(bigram, bigramCount + 1);
                        totalTwoTokens += 1;
                    }
                }
            }
        } catch (FileNotFoundException e) {
            System.out.println("movie_result.txt file not found");
            e.printStackTrace();
        } catch (IOException e) {
            System.out.println("read movie_result.txt file failed");
            e.printStackTrace();
        }
    }

    /** Builds the window: input controls on the left, results fill in later. */
    public void start(Stage primaryStage) throws Exception {
        pane.setLeft(getHBox());
        // Fixed typo: the original wrote "-fx-boder-color", which JavaFX
        // silently ignores as an unknown property.
        pane.setStyle("-fx-border-color:red;-fx-background-color:lightblue;");
        Scene scene = new Scene(pane, 900, 500);
        primaryStage.setTitle("识别正确语序");
        primaryStage.setScene(scene);
        primaryStage.show();
    }

    /**
     * Builds the input column: a prompt label, the sentence TextArea and the
     * button that triggers segmentation + scoring.
     */
    public VBox getHBox() throws Exception {
        VBox hBox = new VBox(30);
        a.setPrefSize(400, 500);
        hBox.setPadding(new Insets(5, 15, 5, 15));
        Label h = new Label("请输入你要测试的句子:");
        Button OK = new Button("分词并得到正确语句");
        OK.setOnAction(e -> {
            try {
                getVBox();
            } catch (Exception e1) {
                e1.printStackTrace();
            }
        });
        hBox.getChildren().add(h);
        hBox.getChildren().add(a);
        hBox.getChildren().add(OK);
        return hBox;
    }

    /**
     * Segments each input sentence with NLPIR, scores it under the trained
     * 2-gram counts and displays the per-sentence score plus the sentences
     * judged grammatical (score above a fixed threshold) on the right pane.
     *
     * @throws Exception if charset lookup or corpus reading fails
     */
    public void getVBox() throws Exception {
        VBox vbox = new VBox(15);
        vbox.setPadding(new Insets(50, 550, 5, 5));
        Label x = new Label("2-Gram训练结果:");
        vbox.getChildren().add(x);
        String argu = "";
        int i = 0;
        double[] d = new double[100];
        String s = a.getText();
        String t[] = s.split("\n");
        String system_charset = "UTF-8";
        int charset_type = 1;
        // Initialise the native segmenter; on failure only report it — the
        // original flow continues regardless.
        if (!CLibrary.Instance.NLPIR_Init(argu.getBytes(system_charset),
                charset_type, "0".getBytes(system_charset))) {
            System.err.println("初始化失败!");
        }
        System.out.println("界面程序端运行正常!");
        // Copy input sentences; i ends up holding the sentence count.
        for (i = 0; i < t.length; i++) {
            ah[i] = t[i];
        }
        try {
            for (int j = 0; j < i; j++) {
                // bPOSTagged = 0: plain segmentation, tokens separated by spaces.
                String nativeBytes = CLibrary.Instance.NLPIR_ParagraphProcess(ah[j], 0);
                String[] c = nativeBytes.trim().split(" ");
                for (int k = 0; k < c.length; k++) {
                    b[j][k] = c[k];
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        calculateTokenCount("C:\\Users\\hongtao\\wordseg\\afterSeg.txt");
        for (int k = 0; k < i; k++) {
            int j = 0;
            // Start from the log of the first token's relative frequency.
            d[k] = 1.0;
            d[k] = Math.log(d[k] * probBetweenTowTokens(b[k][0]) * 1.0 / totalTokensCount);
            while (b[k][j + 1] != null) {
                if (probBetweenTowTokens(b[k][j], b[k][j + 1]) != 0)
                    // NOTE(review): "/1.0*x" divides by 1.0 and then MULTIPLIES
                    // by x, so this adds log(count(w1w2) * count(w1)). A textbook
                    // bigram model would use log(count(w1w2) / count(w1)), but the
                    // "> 45" acceptance threshold below is calibrated to this
                    // formula — both would have to change together, so it is
                    // deliberately left as-is. TODO confirm intent and recalibrate.
                    d[k] = d[k] + Math.log((1.0 * probBetweenTowTokens(b[k][j], b[k][j + 1])) / 1.0 * probBetweenTowTokens(b[k][j]));
                j++;
            }
        }
        // d[] is zero past the last scored sentence, so 0.0 marks the end.
        int k = 0;
        while (d[k] != 0.0) {
            Label h = new Label(ah[k] + " " + d[k]);
            k++;
            vbox.getChildren().add(h);
        }
        Label g = new Label("\n\n\n正确的语句有:");
        vbox.getChildren().add(g);
        k = 0;
        int flag = 0;
        while (d[k] != 0.0) {
            // Threshold tied to the scoring formula above — see NOTE(review).
            if (d[k] > 45) {
                Label h = new Label(ah[k]);
                vbox.getChildren().add(h);
                flag++;
            }
            k++;
        }
        if (flag == 0) {
            Label h = new Label("输入的都是不正确语序的句子!");
            vbox.getChildren().add(h);
        }
        pane.setRight(vbox);
    }
}
没有合适的资源?快使用搜索试试~ 我知道了~
基于NLPIR分词工具的识别准确语句程序java版(2-Gram)
共88个文件
map:16个
pdat:14个
user:12个
需积分: 10 39 下载量 120 浏览量
2016-03-21
13:40:32
上传
评论
收藏 10.93MB ZIP 举报
温馨提示
自然语言处理(Natural Language Processing),马尔科夫假设,根据给出的语料库(大量语法合法的文本),训练n-gram模型。根据训练出的模型,判断测试集中每个句子是不是语法合法的句子.。根据语料库训练n-gram模型。根据训练出来的n-gram模型对测试集中的每个句子赋予概率值,判断句子是否符合语法。
资源推荐
资源详情
资源评论
收起资源包目录
wordseg.zip (88个子文件)
wordseg
文档.doc 134KB
2.分析正确语句
code
main.java 8KB
1.训练源代码
code
main.java 5KB
NLIPIR 分词工具java版
20131024.err 190B
src
code
main.java 4KB
bin
code
main$CLibrary.class 760B
main.class 2KB
.classpath 353B
20160303.err 291B
.settings
org.eclipse.core.resources.prefs 88B
org.eclipse.jdt.core.prefs 629B
.project 383B
Data
nr.ctx 2KB
CoreDict.unig 467KB
FTU8.wordlist 186KB
CoreDict.pos 1.7MB
GBK.pdat 536KB
GBKC.wordlist 163KB
UTF2GBKA.map 279KB
ICTPOS.map 422B
BIG5.wordlist 155KB
GBKA2UTF.map 279KB
NLPIR.user 3KB
sentiment.ung 86KB
DocExtractor.user 3KB
sentiment.user 3KB
NLPIR.ctx 36KB
BIG2GBK.map 279KB
20160119.err 305B
UTF8.wordlist 186KB
GBKA.wordlist 163KB
location.pdat 407KB
Configure.xml 1KB
FieldDict.pos 30B
GBKA.pdat 538KB
PKU.map 323B
FieldDict.pdat 256KB
sentiment.pdat 834KB
English
English.ung 1.6MB
Irrel2regular.map 955KB
ne.pos 1.22MB
English.wordlist 2.74MB
English.pdat 5.06MB
ne.pdat 1.11MB
English.pos 4.29MB
ne.wordlist 653KB
keyExtract.user 3KB
GBK2GBKC.map 279KB
PKU_First.map 300B
20141225.err 92B
GranDict.pos 1.7MB
summary.user 3KB
charset.type 64KB
deepclassifier.user 3KB
GBK.wordlist 163KB
CoreDict.pdat 1.62MB
BiWord.big 3.36MB
NewWord.lst 5KB
BIG5.pdat 457KB
cluster.user 3KB
20151225.err 137B
FTU8.pdat 534KB
UTF2GBK.map 279KB
20160229.err 368B
JZSearch.user 3KB
20151228.err 92B
NLPIR_First.map 288B
GBK2BIG.map 279KB
location.wordlist 104KB
LJHtmlParser.user 3KB
nr.fsa 3KB
FTU82GBK.map 279KB
GBK2UTF.map 279KB
UserDefinedDict.lst 22B
GranDict.pdat 1.89MB
nr.role 1.68MB
LJRedupRemover.user 3KB
classifier.user 3KB
location.map 78KB
GBKC2GBK.map 279KB
NLPIR_trial.user 3KB
GBKC.pdat 538KB
20160107.err 92B
UserDict.pdat 34KB
GBK2FTU8.map 279KB
UTF8.pdat 544KB
NLPIR.dll 1.6MB
jna-4.0.0.jar 893KB
共 88 条
- 1
资源评论
hongtao_fan
- 粉丝: 7
- 资源: 4
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功