赞
踩
花了点时间做了一个词频分析统计程序,分析了 Kevin Kelly 的《失控》英文版(Out of Control)的词频。要想流利地阅读英语文章,词汇量应该达到2万左右。
从词频分析来看,只在文章中出现一次或两次的单词占了一半以上,这是长尾理论的具体体现(举例来说,我们常用的汉字实际上不多,但因出现频次高,所以这些为数不多的汉字占据了右图广大的红区;绝大部分的汉字难得一用,它们就属于长尾)。背单词的时候,这些出现次数不多的单词我们也要注意。
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import com.galaxy.pub.util.FileUtil;
/**
 * Word-frequency counter for a plain-text document.
 *
 * <p>Reads a text file, counts how often each word occurs (ignoring case), and writes
 * one line per distinct word to a report file, most frequent word first.
 */
public class English {

    /**
     * Word separators: any whitespace plus the punctuation marks {@code , . ' ! ( ) ? " : ;}.
     * Because the apostrophe is a separator, {@code don't} is counted as {@code don} and
     * {@code t}. Hyphens and other symbols are not separators and stay part of the word.
     */
    private static final Pattern WORD_SEPARATOR = Pattern.compile("[\\s,.'!()?\":;]+");

    /** Input file used when no path is given on the command line. */
    private static final String DEFAULT_SOURCE = "D:\\exam\\英语\\Out of Control.txt";

    /** Report file used when no path is given on the command line. */
    private static final String DEFAULT_REPORT = "d:\\word.txt";

    /**
     * Command-line entry point.
     *
     * @param args optional: {@code args[0]} is the text file to analyse and {@code args[1]}
     *             is the report file to write; any argument that is missing falls back to
     *             the built-in default path
     */
    public static void main(String[] args) {
        String srcPath = args.length > 0 ? args[0] : DEFAULT_SOURCE;
        String destPath = args.length > 1 ? args[1] : DEFAULT_REPORT;
        analysis(new File(srcPath), new File(destPath));
    }

    /**
     * Counts the words of {@code src} and writes the ranking to {@code dest}.
     *
     * <p>Each report line has the form {@code <rank> <word> <count>} followed by CRLF, where
     * {@code rank} starts at 0 for the most frequent word. Words with the same count are
     * listed alphabetically so the report is reproducible from run to run.
     *
     * @param src  text file to analyse
     * @param dest file that receives the report
     */
    public static void analysis(File src, File dest) {
        String content = FileUtil.readString(src);
        List<Map.Entry<String, Integer>> ranked = rankByFrequency(countWords(content));

        // Build the whole report in memory and hand it over in a single call instead of
        // invoking the file helper once per word.
        StringBuilder report = new StringBuilder();
        for (int rank = 0; rank < ranked.size(); rank++) {
            Map.Entry<String, Integer> entry = ranked.get(rank);
            report.append(rank).append(' ')
                    .append(entry.getKey()).append(' ')
                    .append(entry.getValue()).append("\r\n");
        }
        // NOTE(review): the meaning of the third argument (1) is defined by FileUtil, which
        // is not visible here -- presumably an append flag; confirm against FileUtil.
        FileUtil.writeString(dest, report.toString(), 1);
    }

    /**
     * Splits {@code content} into lower-cased words and counts each distinct word.
     * Empty tokens and tokens that contain a decimal digit are not counted.
     */
    private static Map<String, Integer> countWords(String content) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        // Locale.ROOT keeps the lower-casing independent of the machine's default locale.
        String[] tokens = WORD_SEPARATOR.split(content.toLowerCase(Locale.ROOT));
        for (String token : tokens) {
            // split() yields an empty first token when the text starts with a separator.
            if (token.length() == 0 || containsDigit(token)) {
                continue;
            }
            Integer seen = counts.get(token);
            counts.put(token, seen == null ? 1 : seen + 1);
        }
        return counts;
    }

    /** Returns {@code true} if {@code word} contains at least one character in '0'..'9'. */
    private static boolean containsDigit(String word) {
        for (int i = 0; i < word.length(); i++) {
            char c = word.charAt(i);
            if (c >= '0' && c <= '9') {
                return true;
            }
        }
        return false;
    }

    /** Returns the entries of {@code counts} ordered by count descending, then word ascending. */
    private static List<Map.Entry<String, Integer>> rankByFrequency(Map<String, Integer> counts) {
        List<Map.Entry<String, Integer>> ranked =
                new ArrayList<Map.Entry<String, Integer>>(counts.entrySet());
        Collections.sort(ranked, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> a, Map.Entry<String, Integer> b) {
                // compareTo instead of subtraction: no overflow, and the intent is explicit.
                int byCount = b.getValue().compareTo(a.getValue());
                return byCount != 0 ? byCount : a.getKey().compareTo(b.getKey());
            }
        });
        return ranked;
    }
}
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。