bag-of-words model的java实现
内容导读
互联网集市收集整理的这篇技术教程文章主要介绍了bag-of-words model的java实现,小编现在分享给大家,供广大互联网技术从业者学习和参考。文章包含4980字,纯文字阅读大概需要8分钟。
内容图文
![bag-of-words model的java实现](/upload/InfoBanner/zyjiaocheng/1197/5f0c0f563538488db506c8d0fea7ea1d.jpg)
import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.util.StringTokenizer; public class BowModel { Dict dict; DocFeatureFactory dff; public BowModel(String path) throws Throwable { dict = new Dict(); dict.loadFromLocalFile(path); dff = new DocFeatureFactory(dict.getWord2Index()); } double[][] featureTable; private void generateFeature(String docsFile,int docNum) throws IOException { featureTable = new double[docNum][]; int docIndex=0; File file = new File(docsFile); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file),"utf-8")); while(true) { String line=br.readLine(); if(line == null) break; featureTable[docIndex++] = dff.getFeature(line); } br.close(); } private void nomalizeFeature() { double sum=0; double var =0; for(int col=0;col<featureTable[0].length;col++)//一列代表一个维度 { sum =0; for(int row=0;row<featureTable.length;row++) { sum+= featureTable[row][col]; } sum/=featureTable.length;//均值 var =0; for(int row=0;row<featureTable.length;row++) { var+= (featureTable[row][col]-sum)*(featureTable[row][col]-sum); } var = Math.sqrt(var/featureTable.length);//标准差 if(var == 0) continue; for(int row=0;row<featureTable.length;row++) { featureTable[row][col] = (featureTable[row][col] -sum)/var; } } } private void saveFeature(String path,String label) throws IOException { File file=new File(path); BufferedWriter br= new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file))); for(int i=0;i<featureTable.length;i++) { br.append(label+" "); for(int j=0;j<featureTable[0].length;j++) { br.append(Integer.toString(j+1)+":"+featureTable[i][j]+" "); } br.append("\n"); } br.close(); } public void train() throws IOException { 
generateFeature("/media/linger/G/sources/comment/test/good",340); nomalizeFeature(); saveFeature("svm_good","1"); generateFeature("/media/linger/G/sources/comment/test/bad",314); nomalizeFeature(); saveFeature("svm_bad","-1"); } public static void main(String[] args) throws Throwable { // TODO Auto-generated method stub BowModel bm = new BowModel("/media/linger/G/sources/comment/test/dict"); bm.train(); } }
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.StringTokenizer;

/**
 * Vocabulary built from a space-separated text file.
 *
 * <p>Each distinct token receives a 0-based index in order of first
 * appearance, and its number of occurrences is counted.
 */
public class Dict {

    /** Token to 0-based index; {@code null} until a file has been loaded. */
    HashMap<String,Integer> word2Index = null;

    /** Token to occurrence count; {@code null} until a file has been loaded. */
    Hashtable<String,Integer> word2Count = null;

    /**
     * Reads {@code path} as UTF-8 and rebuilds both maps from its tokens
     * (tokens are separated by the space character only).
     *
     * <p>The maps are built locally and published only after the whole file has
     * been read, so a failed load leaves any previously loaded vocabulary intact.
     *
     * @param path path of the dictionary file
     * @throws IOException if the file cannot be opened or read
     */
    void loadFromLocalFile(String path) throws IOException {
        HashMap<String,Integer> indexByWord = new HashMap<String,Integer>();
        Hashtable<String,Integer> countByWord = new Hashtable<String,Integer>();

        // try-with-resources closes the reader even if readLine() fails midway.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(path)), "utf-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                StringTokenizer tokenizer = new StringTokenizer(line, " ");
                while (tokenizer.hasMoreTokens()) {
                    String term = tokenizer.nextToken();
                    Integer seen = countByWord.get(term);
                    if (seen == null) {
                        countByWord.put(term, 1);
                        // The next free index equals the number of words indexed so far.
                        indexByWord.put(term, indexByWord.size());
                    } else {
                        countByWord.put(term, seen + 1);
                    }
                }
            }
        }

        word2Index = indexByWord;
        word2Count = countByWord;
    }

    /**
     * Returns the token-to-index map.
     *
     * @return the live map built by {@link #loadFromLocalFile(String)}
     * @throws IllegalStateException if no file has been loaded yet
     * @throws Throwable declared for source compatibility with existing callers
     */
    public HashMap<String,Integer> getWord2Index() throws Throwable {
        if (word2Index == null) {
            throw new IllegalStateException("has not loaded file!");
        }
        return word2Index;
    }

    /** Unused placeholder entry point. */
    public static void main(String[] args) {
    }
}
import java.util.HashMap;
import java.util.StringTokenizer;

/**
 * Converts a space-separated document into a term-count vector whose
 * positions are given by a token-to-index map.
 */
public class DocFeatureFactory {

    /** Token to vector position. */
    HashMap<String,Integer> word2Index;

    /** The vector produced by the most recent {@link #getFeature(String)} call. */
    double[] feature;

    /** Vector length; the size of the map at construction time. */
    int dim;

    /**
     * @param w2i token-to-index map; its size fixes the length of every vector
     */
    public DocFeatureFactory(HashMap<String,Integer> w2i) {
        word2Index = w2i;
        dim = w2i.size();
    }

    /**
     * Counts how often each known token occurs in {@code doc}.
     *
     * <p>Tokens missing from the map are skipped; previously they caused a
     * {@link NullPointerException} when the absent index was unboxed.
     *
     * @param doc document text, tokens separated by the space character
     * @return a new array of length {@link #dim} holding the per-token counts
     */
    double[] getFeature(String doc) {
        double[] counts = new double[dim];
        StringTokenizer tokenizer = new StringTokenizer(doc, " ");
        while (tokenizer.hasMoreTokens()) {
            Integer position = word2Index.get(tokenizer.nextToken());
            if (position != null) {
                counts[position]++;
            }
        }
        // Keep the field in sync for any code that reads it after the call.
        feature = counts;
        return counts;
    }

    /** Unused placeholder entry point. */
    public static void main(String[] args) {
    }
}
原文:http://blog.csdn.net/lingerlanlan/article/details/38333687
内容总结
以上是互联网集市为您收集整理的bag-of-words model的java实现全部内容,希望文章能够帮你解决bag-of-words model的java实现所遇到的程序开发问题。 如果觉得互联网集市技术教程内容还不错,欢迎将互联网集市网站推荐给程序员好友。
内容备注
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 gblab@vip.qq.com 举报,一经查实,本站将立刻删除。
内容手机端
扫描二维码推送至手机访问。