- John likes to watch movies.
- Mary likes movies too.
- John also likes football.
| Term | Index |
| --- | --- |
| John | 1 |
| likes | 2 |
| to | 3 |
| watch | 4 |
| movies | 5 |
| Mary | 6 |
| too | 7 |
| also | 8 |
| football | 9 |
- /**
- *
- * @param file 文件位置
- * @return
- * @throws IOException
- */
- public static ArrayList<int[]> txt2num(String file) throws IOException {
- BufferedReader br = new BufferedReader(new FileReader(file));
- String s;
- StringBuilder sb = new StringBuilder();
- ArrayList<String> strArr = new ArrayList<String>();
- while ((s=br.readLine()) != null){
- String tmp = s.split("\\.")[0];
- strArr.add(tmp);
- sb.append(tmp+" ");
- }
- String[] split = sb.toString().split(" ");
- TreeSet<String> strHashSet = new TreeSet<>();
- for (String s1 : split) {
- strHashSet.add(s1);
- }
- ArrayList<int[]> txt2Matrix = new ArrayList<int[]>();
- System.out.println(Arrays.toString(strHashSet.toArray()));
- //填入数据
- for (String s1 : strArr) {
- int[] txt2IntVec = new int[strHashSet.size()];
- String[] ss = s1.split(" ");
- ArrayList<String > strs = new ArrayList<String>();
- for (String s2 : ss) {
- strs.add(s2);
- }
- System.out.println(Arrays.toString(ss));
- for (int i = 0; i < txt2IntVec.length; i++) {
- txt2IntVec[i] = strs.contains(strHashSet.toArray()[i]) ? 1 : 0;
- }
- System.out.println(Arrays.toString(txt2IntVec));
- txt2Matrix.add(txt2IntVec);
- }
- return txt2Matrix;
- }
- public final class Murmur3_x86_32 {
- private static final int C1 = 0xcc9e2d51;
- private static final int C2 = 0x1b873593;
- private final int seed;
- public Murmur3_x86_32(int seed) {
- this.seed = seed;
- }
- @Override
- public String toString() {
- return "Murmur3_32(seed=" + seed + ")";
- }
- public int hashInt(int input) {
- return hashInt(input, seed);
- }
- public static int hashInt(int input, int seed) {
- int k1 = mixK1(input);
- int h1 = mixH1(seed, k1);
- return fmix(h1, 4);
- }
- public int hashUnsafeWords(Object base, long offset, int lengthInBytes) {
- return hashUnsafeWords(base, offset, lengthInBytes, seed);
- }
- public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed) {
- // This is based on Guava's `Murmur32_Hasher.processRemaining(ByteBuffer)` method.
- assert (lengthInBytes % 8 == 0): "lengthInBytes must be a multiple of 8 (word-aligned)";
- int h1 = hashBytesByInt(base, offset, lengthInBytes, seed);
- return fmix(h1, lengthInBytes);
- }
- public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) {
- assert (lengthInBytes >= 0): "lengthInBytes cannot be negative";
- int lengthAligned = lengthInBytes - lengthInBytes % 4;
- int h1 = hashBytesByInt(base, offset, lengthAligned, seed);
- for (int i = lengthAligned; i < lengthInBytes; i++) {
- int halfWord = Platform.getByte(base, offset + i);
- int k1 = mixK1(halfWord);
- h1 = mixH1(h1, k1);
- }
- return fmix(h1, lengthInBytes);
- }
- private static int hashBytesByInt(Object base, long offset, int lengthInBytes, int seed) {
- assert (lengthInBytes % 4 == 0);
- int h1 = seed;
- for (int i = 0; i < lengthInBytes; i += 4) {
- int halfWord = Platform.getInt(base, offset + i);
- int k1 = mixK1(halfWord);
- h1 = mixH1(h1, k1);
- }
- return h1;
- }
- public int hashLong(long input) {
- return hashLong(input, seed);
- }
- public static int hashLong(long input, int seed) {
- int low = (int) input;
- int high = (int) (input >>> 32);
- int k1 = mixK1(low);
- int h1 = mixH1(seed, k1);
- k1 = mixK1(high);
- h1 = mixH1(h1, k1);
- return fmix(h1, 8);
- }
- private static int mixK1(int k1) {
- k1 *= C1;
- k1 = Integer.rotateLeft(k1, 15);
- k1 *= C2;
- return k1;
- }
- private static int mixH1(int h1, int k1) {
- h1 ^= k1;
- h1 = Integer.rotateLeft(h1, 13);
- h1 = h1 * 5 + 0xe6546b64;
- return h1;
- }
- // Finalization mix - force all bits of a hash block to avalanche
- private static int fmix(int h1, int length) {
- h1 ^= length;
- h1 ^= h1 >>> 16;
- h1 *= 0x85ebca6b;
- h1 ^= h1 >>> 13;
- h1 *= 0xc2b2ae35;
- h1 ^= h1 >>> 16;
- return h1;
- }
- }
/**
 * A labelled piece of text.
 *
 * @param item  file name (class name)
 * @param label label
 * @param doc   words or letters extracted from the whole text
 */
case class LabeledText(
  item: String,
  label: Double,
  doc: String
)
- item:文件名字(类名)
- label:标签
- doc:从整个文本中提取的单词或者字母
/**
 * Reads raw text files from HDFS, turns each file into a [[LabeledText]] row
 * and saves the resulting DataFrames.
 */
object NewClassifier {

  /**
   * Recursively lists every file below `path`.
   *
   * @param path root directory to scan
   * @return the full path string of each file found
   */
  def listSonRoute(path: String): Seq[String] = {
    val root = new Path(path)
    val files = root.getFileSystem(new Configuration()).listFiles(root, true)
    // A builder keeps this linear; `res ++ Seq(x)` on a List copies the
    // whole list on every append.
    val found = List.newBuilder[String]
    while (files.hasNext) {
      found += files.next().getPath.toString
    }
    found.result()
  }

  /**
   * Extracts English words or letters.
   *
   * @param content text to scan
   * @return every maximal run of ASCII letters in `content`, in order
   */
  def splitStr(content: String): List[String] =
    // No trailing `$`: anchoring to the end of the input would match at most
    // the last word, and only when the text ends with a letter.
    "[A-Za-z]+".r.findAllIn(content).toList

  /**
   * Reads the text file at `path` and folds the lower-cased words of every
   * line into a single string through a string accumulator.
   *
   * How the words are joined is decided by `StringAccumulatorParam`, which is
   * defined outside this object.
   */
  def rdd2Str(sc: SparkContext, path: String) = {
    val rdd = sc.textFile(path)
    val myAccumulator = sc.accumulator[String](" ")(StringAccumulatorParam)
    rdd.foreach { line =>
      splitStr(line).foreach { word =>
        myAccumulator.add(word.toLowerCase)
      }
    }
    myAccumulator.value
  }

  /**
   * Builds a DataFrame with one [[LabeledText]] row per file below `path`.
   *
   * `item` is the 9th `/`-separated segment of the file's path, `label` is a
   * pseudo-random number and `doc` is the output of [[rdd2Str]].
   */
  def getDataFromHDFS(sc: SparkContext, path: String): DataFrame = {
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    // One generator for the whole listing: creating `new Random(100)` per row
    // restarts the sequence each time and gives every row the same label.
    // NOTE(review): labels are still placeholders; they do not encode the
    // document's class.
    val labelSource = new Random(100)
    listSonRoute(path).map { part =>
      // NOTE(review): index 8 presumably selects the category directory for
      // the HDFS layout used in `main`; confirm before using other roots.
      LabeledText(part.split("/").apply(8), labelSource.nextInt(), rdd2Str(sc, part))
    }.toDF()
  }

  /** Converts the raw test and train sets and saves them as DataFrames. */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("new Classifier").setMaster("local")
      .set("spark.storage.memoryFraction", "0.1")
    val sc = new SparkContext(conf)
    try {
      // rawData to parquet
      val testPath = "hdfs://master:9000/data/studySet/textMining/20news-bydate/20news-bydate-test"
      val trainPath = "hdfs://master:9000/data/studySet/textMining/20news-bydate/20news-bydate-train/"
      val testDF = getDataFromHDFS(sc, testPath)
      val trainDF = getDataFromHDFS(sc, trainPath)
      testDF.write.save("hdfs://master:9000/data/studySet/textMining/20news-bydate/test")
      trainDF.write.save("hdfs://master:9000/data/studySet/textMining/20news-bydate/train")
    } finally {
      // Release the context even when reading or writing fails.
      sc.stop()
    }
  }
}
