本文共 3324 字,大约阅读时间需要 11 分钟。
已经分好词的文本中还有许多与我们的需求无关的词,这就需要进行停用词过滤,而 wvtool 已经实现了这样的功能!废话不多说,上代码吧:
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileReader;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.io.Reader;
- import java.io.StringReader;
-
- import edu.udo.cs.wvtool.config.WVTConfiguration;
- import edu.udo.cs.wvtool.generic.inputfilter.SelectingInputFilter;
- import edu.udo.cs.wvtool.generic.loader.UniversalLoader;
- import edu.udo.cs.wvtool.generic.tokenizer.NGramTokenizer;
- import edu.udo.cs.wvtool.generic.tokenizer.SimpleTokenizer;
- import edu.udo.cs.wvtool.generic.wordfilter.StopWordFilterFile;
- import edu.udo.cs.wvtool.main.WVTDocumentInfo;
- import edu.udo.cs.wvtool.util.TokenEnumeration;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- public class Demo01_1 {
- public static String pathString="D:\\工作管理\\weka学习\\wvtool-1.1\\wvtool-1.1\\examples\\data\\";
- public static void main(String[] args) throws Exception {
-
-
-
- UniversalLoader loader=new UniversalLoader();
-
-
-
- WVTDocumentInfo info=new WVTDocumentInfo(pathString+"text.html", "html", "utf-8", "chinese");
-
-
-
- InputStream stream=loader.loadDocument(info);
-
-
-
-
-
-
-
-
-
- SelectingInputFilter filter=new SelectingInputFilter();
-
-
-
- Reader readers=filter.convertToPlainText(stream,info);
-
-
-
- BufferedReader reader=new BufferedReader(readers);
-
- String string=reader.readLine().toString();
-
-
- string=string.replace(string.valueOf((char)9), "");
-
- Reader reader2=new StringReader(string);
-
-
-
-
-
- NGramTokenizer tokenizer=new NGramTokenizer(1,new SimpleTokenizer());
-
-
- TokenEnumeration enumeration=tokenizer.tokenize(reader2, info);
-
-
-
-
- File file=new File(pathString+"ext_stopword.dic");
-
-
-
-
-
- Reader reader3=new InputStreamReader(new FileInputStream(file),"unicode");
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- StopWordFilterFile filter1=new StopWordFilterFile(1, reader3);
-
- TokenEnumeration enumeration2=filter1.filter(enumeration, info);
-
- while(enumeration2.hasMoreTokens()){
- System.out.print(enumeration2.nextToken()+"/");
- }
- System.out.println();
-
-
-
-
-
- }
- }
弄了这么久,终于可以慢慢深入进去了!
转载地址:http://gxpla.baihongyu.com/