Build the Hadoop development environment using Maven

2020-05-10 18:13:40
OfStack

Maven usage itself is not covered here: there is plenty of material about it on the Internet, and it has not changed much over the years. This article only describes how to set up a Hadoop development environment.

1. Create the project first

mvn archetype:generate -DgroupId=my.hadoopstudy -DartifactId=hadoopstudy -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false

2. Then add hadoop-common, hadoop-client, hadoop-hdfs to pom.xml. The pom.xml file is as follows


<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>my.hadoopstudy</groupId>
  <artifactId>hadoopstudy</artifactId>
  <packaging>jar</packaging>
  <version>1.0-SNAPSHOT</version>
  <name>hadoopstudy</name>
  <url>http://maven.apache.org</url>

  <properties>
    <!-- Single place to set the Hadoop version, so the three Hadoop
         artifacts below can never drift apart. -->
    <hadoop.version>2.5.1</hadoop.version>
  </properties>

  <dependencies>
    <!-- Hadoop libraries used by the HDFS and MapReduce examples. -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>

    <!-- Test-only dependency generated by maven-archetype-quickstart. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>

3. Testing

3.1 First, test HDFS development. This assumes the Hadoop cluster set up in the previous Hadoop article is being used. The class code is as follows:


package my.hadoopstudy.dfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

import java.io.InputStream;
import java.net.URI;

/**
 * Minimal HDFS smoke test: lists a directory, writes a small file into it and
 * prints that file back to standard output.
 *
 * <p>The NameNode URI and the {@code /user/fkong} paths are hard-coded; adjust
 * them to match the cluster being used.
 */
public class Test {
    /**
     * Runs the list, write and read steps in sequence.
     *
     * @param args ignored
     * @throws Exception if obtaining the file system or any file operation fails
     */
    public static void main(String[] args) throws Exception {
        String uri = "hdfs://9.111.254.189:9000/";
        Configuration config = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), config);
        // try/finally instead of try-with-resources keeps the class compilable
        // at pre-Java-7 source levels.
        try {
            // List all files and directories under /user/fkong on HDFS.
            FileStatus[] statuses = fs.listStatus(new Path("/user/fkong"));
            for (FileStatus status : statuses) {
                System.out.println(status);
            }

            // Create a file under /user/fkong on HDFS and write one line of text to it.
            FSDataOutputStream os = fs.create(new Path("/user/fkong/test.log"));
            try {
                // Explicit charset so the bytes written do not depend on the
                // platform default encoding of the client machine.
                os.write("Hello World!".getBytes("UTF-8"));
                os.flush();
            } finally {
                // Release the stream even when the write fails.
                os.close();
            }

            // Print the contents of the file that was just written.
            InputStream is = fs.open(new Path("/user/fkong/test.log"));
            try {
                // close=false: with close=true copyBytes would also close
                // System.out, not just the input stream.
                IOUtils.copyBytes(is, System.out, 1024, false);
            } finally {
                IOUtils.closeStream(is);
            }
        } finally {
            fs.close();
        }
    }
}

3.2 Test a MapReduce job

The test code is relatively simple, as follows:


package my.hadoopstudy.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;

public class EventCount {

 public static class MyMapper extends Mapper<Object, Text, Text, IntWritable>{
 private final static IntWritable one = new IntWritable(1);
 private Text event = new Text();

 public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
  int idx = value.toString().indexOf(" ");
  if (idx > 0) {
  String e = value.toString().substring(0, idx);
  event.set(e);
  context.write(event, one);
  }
 }
 }

 public static class MyReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
 private IntWritable result = new IntWritable();

 public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
  int sum = 0;
  for (IntWritable val : values) {
  sum += val.get();
  }
  result.set(sum);
  context.write(key, result);
 }
 }

 public static void main(String[] args) throws Exception {
 Configuration conf = new Configuration();
 String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
 if (otherArgs.length < 2) {
  System.err.println("Usage: EventCount <in> <out>");
  System.exit(2);
 }
 Job job = Job.getInstance(conf, "event count");
 job.setJarByClass(EventCount.class);
 job.setMapperClass(MyMapper.class);
 job.setCombinerClass(MyReducer.class);
 job.setReducerClass(MyReducer.class);
 job.setOutputKeyClass(Text.class);
 job.setOutputValueClass(IntWritable.class);
 FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
 FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
 System.exit(job.waitForCompletion(true) ? 0 : 1);
 }
}

Run the "mvn package" command to build the jar file hadoopstudy-1.0-SNAPSHOT.jar, then copy the jar file to the Hadoop installation directory.

Suppose we need to analyze the event information in several log files and count how many times each kind of event occurs. Create a directory containing the following files:

/tmp/input/event.log.1
/tmp/input/event.log.2
/tmp/input/event.log.3

Because this is only an example, every file can have the same content, for instance:

JOB_NEW ...
JOB_NEW ...
JOB_FINISH ...
JOB_NEW ...
JOB_FINISH ...

Then copy these files to HDFS

$ bin/hdfs dfs -put /tmp/input /user/fkong/input

Run the mapreduce job

$ bin/hadoop jar hadoopstudy-1.0-SNAPSHOT.jar my.hadoopstudy.mapreduce.EventCount /user/fkong/input /user/fkong/output

View execution results

$ bin/hdfs dfs -cat /user/fkong/output/part-r-00000