经验首页 前端设计 程序设计 Java相关 移动开发 数据库/运维 软件/图像 大数据/云计算 其他经验
当前位置:技术经验 » 大数据/云/AI » MapReduce » 查看文章
Java实现MapReduce Wordcount案例
来源:cnblogs  作者:swineherd_MCQ  时间:2019/11/4 8:28:51  对本文有异议

先改pom.xml:

  1. <project xmlns="http://maven.apache.org/POM/4.0.0"
  2. xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  3. xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  4. <modelVersion>4.0.0</modelVersion>
  5. <groupId>com.mcq</groupId>
  6. <artifactId>mr-1101</artifactId>
  7. <version>0.0.1-SNAPSHOT</version>
  8. <dependencies>
  9. <dependency>
  10. <groupId>jdk.tools</groupId>
  11. <artifactId>jdk.tools</artifactId>
  12. <version>1.8</version>
  13. <scope>system</scope>
  14. <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
  15. </dependency>
  16. <dependency>
  17. <groupId>junit</groupId>
  18. <artifactId>junit</artifactId>
  19. <version>RELEASE</version>
  20. </dependency>
  21. <dependency>
  22. <groupId>org.apache.logging.log4j</groupId>
  23. <artifactId>log4j-core</artifactId>
  24. <version>2.8.2</version>
  25. </dependency>
  26. <dependency>
  27. <groupId>org.apache.hadoop</groupId>
  28. <artifactId>hadoop-common</artifactId>
  29. <version>2.7.2</version>
  30. </dependency>
  31. <dependency>
  32. <groupId>org.apache.hadoop</groupId>
  33. <artifactId>hadoop-client</artifactId>
  34. <version>2.7.2</version>
  35. </dependency>
  36. <dependency>
  37. <groupId>org.apache.hadoop</groupId>
  38. <artifactId>hadoop-hdfs</artifactId>
  39. <version>2.7.2</version>
  40. </dependency>
  41. </dependencies>
  42. </project>

在resources文件夹下添加文件 log4j.properties:

# Root logger: INFO level, sending output to the "stdout" appender below.
log4j.rootLogger=INFO, stdout
# Console appender so MapReduce progress/counters show up in the IDE console.
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
# File appender writing to target/spring.log.
# NOTE(review): "logfile" is configured but not attached to the root logger
# above, so it is inactive unless a logger references it - confirm if intended.
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n

 

 WordcountDriver.java:

  1. package com.mcq;
  2.  
  3. import java.io.IOException;
  4.  
  5. import org.apache.hadoop.conf.Configuration;
  6. import org.apache.hadoop.fs.Path;
  7. import org.apache.hadoop.io.IntWritable;
  8. import org.apache.hadoop.io.Text;
  9. import org.apache.hadoop.mapreduce.Job;
  10. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  11. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
  12.  
  13. public class WordcountDriver{
  14. public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
  15. System.out.println("hello");
  16. Configuration conf=new Configuration();
  17. //1.获取Job对象
  18. Job job=Job.getInstance(conf);
  19. //2.设置jar存储位置
  20. job.setJarByClass(WordcountDriver.class);
  21. //3.关联Map和Reduce类
  22. job.setMapperClass(WordcountMapper.class);
  23. job.setReducerClass(WordcountReducer.class);
  24. //4.设置Mapper阶段输出数据的key和value类型
  25. job.setMapOutputKeyClass(Text.class);
  26. job.setMapOutputValueClass(IntWritable.class);
  27. //5.设置最终输出的key和value类型
  28. job.setOutputKeyClass(Text.class);
  29. job.setOutputValueClass(IntWritable.class);
  30. //6.设置输入路径和输出路径
  31. FileInputFormat.setInputPaths(job, new Path(args[0]));
  32. FileOutputFormat.setOutputPath(job, new Path(args[1]));
  33. //7.提交Job
  34. // job.submit();
  35. job.waitForCompletion(true);
  36. // boolean res=job.waitForCompletion(true);//true表示打印结果
  37. // System.exit(res?0:1);
  38. }
  39. }

 WordcountMapper.java:

  1. package com.mcq;
  2.  
  3. import java.io.IOException;
  4.  
  5. import org.apache.hadoop.io.IntWritable;
  6. import org.apache.hadoop.io.LongWritable;
  7. import org.apache.hadoop.io.Text;
  8. import org.apache.hadoop.mapreduce.Mapper;
  9.  
  10. //map阶段
  11. //KEYIN:输入数据的key(偏移量,比如第一行是0~19,第二行是20~25),必须是LongWritable
  12. //VALUEIN:输入数据的value(比如文本内容是字符串,那就填Text)
  13. //KEYOUT:输出数据的key类型
  14. //VALUEOUT:输出数据的值类型
  15. public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
  16. IntWritable v=new IntWritable(1);
  17. Text k = new Text();
  18. @Override
  19. protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
  20. throws IOException, InterruptedException {
  21. // TODO Auto-generated method stub
  22. //1.获取一行
  23. String line=value.toString();
  24. //2.切割单词
  25. String[] words=line.split(" ");
  26. //3.循环写出
  27. for(String word:words) {
  28. k.set(word);
  29. context.write(k, v);
  30. }
  31. }
  32. }

 WordcountReducer.java:

  1. package com.mcq;
  2.  
  3. import java.io.IOException;
  4.  
  5. import org.apache.hadoop.io.IntWritable;
  6. import org.apache.hadoop.io.Text;
  7. import org.apache.hadoop.mapreduce.Reducer;
  8.  
  9. //KEYIN、VALUEIN:map阶段输出的key和value类型
  10. public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
  11. IntWritable v=new IntWritable();
  12. @Override
  13. protected void reduce(Text key, Iterable<IntWritable> values,
  14. Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
  15. // TODO Auto-generated method stub
  16. int sum=0;
  17. for(IntWritable value:values) {
  18. sum+=value.get();
  19. }
  20. v.set(sum);
  21. context.write(key, v);
  22. }
  23. }

在run configuration里加上参数e:/mrtest/in.txt e:/mrtest/out.txt

 

 

运行时遇到了一个报错,参考 https://blog.csdn.net/qq_40310148/article/details/86617512 中给出的方法解决了。

 

在集群上运行:

用maven打成jar包,需要添加一些打包依赖:

  1. <build>
  2. <plugins>
  3. <plugin>
  4. <artifactId>maven-compiler-plugin</artifactId>
  5. <version>2.3.2</version>
  6. <configuration>
  7. <source>1.8</source>
  8. <target>1.8</target>
  9. </configuration>
  10. </plugin>
  11. <plugin>
  12. <artifactId>maven-assembly-plugin </artifactId>
  13. <configuration>
  14. <descriptorRefs>
  15. <descriptorRef>jar-with-dependencies</descriptorRef>
  16. </descriptorRefs>
  17. <archive>
  18. <manifest>
  19. <mainClass>com.mcq.WordcountDriver</mainClass>
  20. </manifest>
  21. </archive>
  22. </configuration>
  23. <executions>
  24. <execution>
  25. <id>make-assembly</id>
  26. <phase>package</phase>
  27. <goals>
  28. <goal>single</goal>
  29. </goals>
  30. </execution>
  31. </executions>
  32. </plugin>
  33. </plugins>
  34. </build>

 注意上面 mainClass 里要填驱动类的完全限定名(包名 + 类名),可以在 IDE 中右键点击类名,选择 Copy Qualified Name 获取。

将程序打成jar包(具体操作:右键工程名run as maven install,然后target文件夹会产生两个jar包,我们把不用依赖的包拷贝到hadoop集群上,因为集群已经配好相关依赖了),上传到集群

输入以下命令运行

hadoop jar mr-1101-0.0.1-SNAPSHOT.jar com.mcq.WordcountDriver /xiaocao.txt /output

注意这里输入输出的路径是集群上的路径。

原文链接:http://www.cnblogs.com/mcq1999/p/11780758.html

 友情链接:直通硅谷  点职佳  北美留学生论坛

本站QQ群:前端 618073944 | Java 606181507 | Python 626812652 | C/C++ 612253063 | 微信 634508462 | 苹果 692586424 | C#/.net 182808419 | PHP 305140648 | 运维 608723728

W3xue 的所有内容仅供测试,对任何法律问题及风险不承担任何责任。通过使用本站内容随之而来的风险与本站无关。
关于我们  |  意见建议  |  捐助我们  |  报错有奖  |  广告合作、友情链接(目前9元/月)请联系QQ:27243702 沸活量
皖ICP备17017327号-2 皖公网安备34020702000426号