
[头哥实践平台] MapReduce Basics in Practice: Challenge 3 - Information Mining (Mining Parent-Child Relationships)


1. Challenge 1: Score Statistics

Programming requirements

Use MapReduce to compute each student's best score in the class. The input file path is /user/test/input; write the computed results to the /user/test/output/ directory.
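
The task statement does not spell out the file format; judging from the mapper code further down, each input line is assumed to hold a student name and a score separated by a space. As a quick sanity check of what the job should produce, here is a minimal plain-Java sketch (no Hadoop, hypothetical names and scores) of the same best-score-per-student computation:

  import java.util.*;

  public class BestScoreSketch {
      public static void main(String[] args) {
          // Hypothetical input lines in the assumed "name score" format.
          String[] lines = {"zhangsan 68", "lisi 77", "zhangsan 91", "lisi 60"};
          // TreeMap keeps student names sorted, mimicking the shuffle phase.
          Map<String, Integer> best = new TreeMap<>();
          for (String line : lines) {
              String[] parts = line.split(" ");
              // Keep only the maximum score seen for each student.
              best.merge(parts[0], Integer.parseInt(parts[1]), Math::max);
          }
          best.forEach((name, score) -> System.out.println(name + "\t" + score));
      }
  }

The MapReduce job below computes the same thing, with the shuffle phase doing the grouping and sorting that the TreeMap does here.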

First, the command line, as follows:
Each line is one command.

  touch file01
  echo Hello World Bye World
  cat file01
  echo Hello World Bye World >file01
  cat file01
  touch file02
  echo Hello Hadoop Goodbye Hadoop >file02
  cat file02
  start-dfs.sh
  hadoop fs -mkdir /usr
  hadoop fs -mkdir /usr/input
  hadoop fs -ls /usr/output
  hadoop fs -ls /
  hadoop fs -ls /usr
  hadoop fs -put file01 /usr/input
  hadoop fs -put file02 /usr/input
  hadoop fs -ls /usr/input

Code section:

  import java.io.IOException;
  import java.util.StringTokenizer;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.io.*;
  import org.apache.hadoop.mapreduce.Job;
  import org.apache.hadoop.mapreduce.Mapper;
  import org.apache.hadoop.mapreduce.Reducer;
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

  public class WordCount {
      /********** Begin **********/
      // Mapper: each input line is "name score"; emit (name, score).
      public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
          private IntWritable score = new IntWritable();
          private Text word = new Text();

          public void map(LongWritable key, Text value, Context context)
                  throws IOException, InterruptedException {
              StringTokenizer itr = new StringTokenizer(value.toString(), "\n");
              while (itr.hasMoreTokens()) {
                  String[] str = itr.nextToken().split(" ");
                  word.set(str[0]);                        // student name
                  score.set(Integer.parseInt(str[1]));     // score
                  context.write(word, score);
              }
          }
      }

      // Reducer: keep the maximum score seen for each student.
      // Because max is associative and commutative, the same class can also serve as the combiner.
      public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
          private IntWritable result = new IntWritable();

          public void reduce(Text key, Iterable<IntWritable> values, Context context)
                  throws IOException, InterruptedException {
              int maxScore = 0;
              for (IntWritable val : values) {
                  maxScore = Math.max(maxScore, val.get());
              }
              result.set(maxScore);
              context.write(key, result);
          }
      }

      public static void main(String[] args) throws Exception {
          Configuration conf = new Configuration();
          Job job = Job.getInstance(conf, "word count");
          job.setJarByClass(WordCount.class);
          job.setMapperClass(TokenizerMapper.class);
          job.setCombinerClass(IntSumReducer.class);
          job.setReducerClass(IntSumReducer.class);
          job.setOutputKeyClass(Text.class);
          job.setOutputValueClass(IntWritable.class);
          String inputfile = "/user/test/input";
          String outputFile = "/user/test/output/";
          FileInputFormat.addInputPath(job, new Path(inputfile));
          FileOutputFormat.setOutputPath(job, new Path(outputFile));
          job.waitForCompletion(true);
          /********** End **********/
      }
  }

2. Challenge 2: Merging Files and Removing Duplicates

Programming requirements
Next, let's consolidate the MapReduce knowledge we have learned with an exercise.

Given two input files, file1 and file2, write a MapReduce program that merges the two files and removes the duplicate content in them, producing a new output file file3.
To complete the merge-and-deduplicate task, your program must combine the files, which contain overlapping content, into one integrated file without duplicates, following these rules:

The first column is sorted by student ID;
For the same student ID, sort by x, y, z;
The input path is /user/tmp/input/;
The output path is /user/tmp/output/.
Note: the input files have already been created for you on the back end; you do not need to create them again.
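
Before the MapReduce version, here is a minimal plain-Java sketch of the same merge, deduplicate, and sort behaviour; the rows below are hypothetical stand-ins for the platform's real input files:

  import java.util.*;

  public class MergeDedupSketch {
      public static void main(String[] args) {
          // Hypothetical rows from the two input files, in the assumed "studentId letter" format.
          String[] file1 = {"20170101 x", "20170102 y", "20170101 x"};
          String[] file2 = {"20170102 y", "20170101 z"};
          // TreeMap sorts by student ID; TreeSet sorts the letters and drops duplicates.
          TreeMap<String, TreeSet<String>> merged = new TreeMap<>();
          for (String[] file : new String[][]{file1, file2}) {
              for (String row : file) {
                  String[] parts = row.split(" ");
                  merged.computeIfAbsent(parts[0], k -> new TreeSet<>()).add(parts[1]);
              }
          }
          merged.forEach((id, letters) ->
                  letters.forEach(letter -> System.out.println(id + "\t" + letter)));
      }
  }

In the MapReduce job, the shuffle phase plays the role of the TreeMap (grouping and sorting by student ID), and a TreeSet inside the reducer handles deduplication and the x, y, z ordering.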

Please start Hadoop before clicking Evaluate!
So first enter the start command below on the command line:

  start-dfs.sh

  import java.io.IOException;
  import java.util.*;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.io.*;
  import org.apache.hadoop.mapreduce.Job;
  import org.apache.hadoop.mapreduce.Mapper;
  import org.apache.hadoop.mapreduce.Reducer;
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

  public class Merge {
      /**
       * @param args
       * Merge files A and B, remove the duplicate content, and produce a new output file C.
       */
      // Override the map function here: copy the input value straight into the output key.
      // Note that the map method must declare throws IOException, InterruptedException.
      public static class Map extends Mapper<Object, Text, Text, Text> {
          /********** Begin **********/
          public void map(Object key, Text value, Context content)
                  throws IOException, InterruptedException {
              Text text1 = new Text();
              Text text2 = new Text();
              // Each line is "studentId letter"; emit (studentId, letter).
              StringTokenizer itr = new StringTokenizer(value.toString());
              while (itr.hasMoreTokens()) {
                  text1.set(itr.nextToken());
                  text2.set(itr.nextToken());
                  content.write(text1, text2);
              }
          }
          /********** End **********/
      }

      // Override the reduce function here: copy the input key straight into the output key.
      // Note that the reduce method must declare throws IOException, InterruptedException.
      public static class Reduce extends Reducer<Text, Text, Text, Text> {
          /********** Begin **********/
          public void reduce(Text key, Iterable<Text> values, Context context)
                  throws IOException, InterruptedException {
              // A TreeSet both removes duplicate values and keeps them sorted (x, y, z, ...).
              Set<String> set = new TreeSet<String>();
              for (Text tex : values) {
                  set.add(tex.toString());
              }
              for (String tex : set) {
                  context.write(key, new Text(tex));
              }
          }
          /********** End **********/
      }

      public static void main(String[] args) throws Exception {
          Configuration conf = new Configuration();
          conf.set("fs.default.name", "hdfs://localhost:9000");
          Job job = Job.getInstance(conf, "Merge and duplicate removal");
          job.setJarByClass(Merge.class);
          job.setMapperClass(Map.class);
          job.setCombinerClass(Reduce.class);
          job.setReducerClass(Reduce.class);
          job.setOutputKeyClass(Text.class);
          job.setOutputValueClass(Text.class);
          String inputPath = "/user/tmp/input/";   // set the input path here
          String outputPath = "/user/tmp/output/"; // set the output path here
          FileInputFormat.addInputPath(job, new Path(inputPath));
          FileOutputFormat.setOutputPath(job, new Path(outputPath));
          System.exit(job.waitForCompletion(true) ? 0 : 1);
      }
  }

3. Challenge 3: Information Mining - Mining Parent-Child Relationships
Programming requirements
Your program must mine child-parent relationships and produce a table of grandchild-grandparent relationships. The rules are as follows:

The grandchild comes first, the grandparent second;
The input path is /user/reduce/input;
The output path is /user/reduce/output.

Please start Hadoop before clicking Evaluate!
So first enter the start command below on the command line:

  start-dfs.sh

  import java.io.IOException;
  import java.util.*;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.Job;
  import org.apache.hadoop.mapreduce.Mapper;
  import org.apache.hadoop.mapreduce.Reducer;
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

  public class simple_data_mining {
      public static int time = 0;

      /**
       * @param args
       * Input: a child-parent table.
       * Output: a table of grandchild-grandparent relationships.
       */
      // Map splits each input line on spaces into child and parent, then outputs the pair twice:
      // once in forward order as the "right table" and once in reverse order as the "left table".
      // Note that the output value must carry a flag distinguishing the left and right tables.
      public static class Map extends Mapper<Object, Text, Text, Text> {
          public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
              /********** Begin **********/
              String line = value.toString();
              String[] childAndParent = line.split(" ");
              List<String> list = new ArrayList<>(2);
              for (String childOrParent : childAndParent) {
                  if (!"".equals(childOrParent)) {
                      list.add(childOrParent);
                  }
              }
              // Skip the header line ("child parent").
              if (!"child".equals(list.get(0))) {
                  String childName = list.get(0);
                  String parentName = list.get(1);
                  // Flag "1": keyed by the parent, so the value's child is a candidate grandchild.
                  String relationType = "1";
                  context.write(new Text(parentName), new Text(relationType + "+"
                          + childName + "+" + parentName));
                  // Flag "2": keyed by the child, so the value's parent is a candidate grandparent.
                  relationType = "2";
                  context.write(new Text(childName), new Text(relationType + "+"
                          + childName + "+" + parentName));
              }
              /********** End **********/
          }
      }

      public static class Reduce extends Reducer<Text, Text, Text, Text> {
          public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
              /********** Begin **********/
              // Write the header row once.
              if (time == 0) {
                  context.write(new Text("grand_child"), new Text("grand_parent"));
                  time++;
              }
              // Children taken from the left table (flag "1"): candidate grandchildren.
              List<String> grandChild = new ArrayList<>();
              // Parents taken from the right table (flag "2"): candidate grandparents.
              List<String> grandParent = new ArrayList<>();
              for (Text text : values) {
                  String s = text.toString();
                  String[] relation = s.split("\\+");
                  String relationType = relation[0];
                  String childName = relation[1];
                  String parentName = relation[2];
                  if ("1".equals(relationType)) {
                      grandChild.add(childName);
                  } else {
                      grandParent.add(parentName);
                  }
              }
              // Join: pair every grandchild with every grandparent found under this key.
              int grandParentNum = grandParent.size();
              int grandChildNum = grandChild.size();
              if (grandParentNum != 0 && grandChildNum != 0) {
                  for (int m = 0; m < grandChildNum; m++) {
                      for (int n = 0; n < grandParentNum; n++) {
                          // Output one grandchild-grandparent pair.
                          context.write(new Text(grandChild.get(m)), new Text(
                                  grandParent.get(n)));
                      }
                  }
              }
              /********** End **********/
          }
      }

      public static void main(String[] args) throws Exception {
          Configuration conf = new Configuration();
          Job job = Job.getInstance(conf, "Single table join");
          job.setJarByClass(simple_data_mining.class);
          job.setMapperClass(Map.class);
          job.setReducerClass(Reduce.class);
          job.setOutputKeyClass(Text.class);
          job.setOutputValueClass(Text.class);
          String inputPath = "/user/reduce/input";   // set the input path
          String outputPath = "/user/reduce/output"; // set the output path
          FileInputFormat.addInputPath(job, new Path(inputPath));
          FileOutputFormat.setOutputPath(job, new Path(outputPath));
          System.exit(job.waitForCompletion(true) ? 0 : 1);
      }
  }
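
The "1+"/"2+" flags are what make this single-table self-join work. Each child-parent line is emitted twice: keyed by the parent with flag 1 (so the reducer for person X collects X's children) and keyed by the child with flag 2 (so the same reducer also collects X's parents). Pairing the two lists at each key yields grandchild-grandparent pairs. Below is a minimal plain-Java trace of that dataflow with hypothetical names (Tom, Lucy, Mary, Frank):

  import java.util.*;

  public class SelfJoinSketch {
      public static void main(String[] args) {
          // Hypothetical child-parent lines (the real input's "child parent" header is skipped by the mapper).
          String[] lines = {"Tom Lucy", "Lucy Mary", "Lucy Frank"};
          // key -> list of flagged records, mimicking what the shuffle phase hands each reduce call.
          Map<String, List<String>> grouped = new TreeMap<>();
          for (String line : lines) {
              String[] cp = line.split(" ");
              String child = cp[0], parent = cp[1];
              grouped.computeIfAbsent(parent, k -> new ArrayList<>()).add("1+" + child + "+" + parent);
              grouped.computeIfAbsent(child, k -> new ArrayList<>()).add("2+" + child + "+" + parent);
          }
          // Reduce step: pair every flag-1 child with every flag-2 parent under the same key.
          grouped.forEach((person, records) -> {
              List<String> grandChildren = new ArrayList<>();
              List<String> grandParents = new ArrayList<>();
              for (String record : records) {
                  String[] parts = record.split("\\+");
                  if ("1".equals(parts[0])) {
                      grandChildren.add(parts[1]);   // the key's child
                  } else {
                      grandParents.add(parts[2]);    // the key's parent
                  }
              }
              for (String gc : grandChildren) {
                  for (String gp : grandParents) {
                      System.out.println(gc + "\t" + gp);
                  }
              }
          });
      }
  }

For the key Lucy this prints Tom with Mary and Tom with Frank, which is exactly the kind of pair the reducer above emits.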
