一,准备工作:eclipse 安装hadoop 插件:
新建mapreduce project
map 用于分词,reduce计数。
package tank.demo;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Word counter: a MapReduce job whose mapper tokenizes each input line and
 * whose reducer sums the occurrences of every token.
 *
 * @author tank
 * @date 2015-01-05 10:03:43
 * @version 0.1
 */
public class WordCount {

    /**
     * Splits every input line into tokens and emits a {@code (token, 1)} pair
     * for each of them.
     *
     * <p>The type arguments are required: with a raw {@code Mapper} the
     * {@code map} method below would only overload the framework method
     * instead of overriding it.
     */
    public static class TokenizerMapper
            extends Mapper<LongWritable, Text, Text, IntWritable> {

        private static final IntWritable one = new IntWritable(1);

        // Reused for every token so no new Text is allocated per output record.
        private final Text word = new Text();

        /**
         * Tokenizes one input record and writes {@code (token, 1)} per token.
         *
         * @param key     key of the input record (not used)
         * @param value   text of the input record
         * @param context sink for the emitted pairs
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // StringTokenizer with default delimiters splits on whitespace.
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    /**
     * Adds up all counts received for a token and emits {@code (token, total)}.
     */
    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        // Reused for every key so no new IntWritable is allocated per output record.
        private final IntWritable result = new IntWritable();

        /**
         * Sums the values grouped under {@code key} and writes the total.
         *
         * @param key     the token being counted
         * @param values  the partial counts for that token
         * @param context sink for the emitted pair
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    /**
     * Configures and runs the word-count job, then exits the JVM.
     *
     * <p>Exit status: 0 if the job succeeds, 1 if it fails, 2 if the argument
     * count is wrong.
     *
     * @param args exactly two elements: the input path and the output path
     * @throws Exception if the job cannot be set up or run
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        if (args.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }

        // Job.getInstance replaces the deprecated Job(Configuration, String) constructor.
        Job job = Job.getInstance(conf, "word count");

        // Main class, used to locate the jar that contains the job.
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);

        // Map output types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Final (reduce) output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
hadoop fs -mkdir /user/hadoop/input   # 建好输入目录
echo hello my hadoop this is my first application>file1
echo hello world my deer my applicaiton >file2
hadoop fs -put file* /user/hadoop/input
hadoop fs -ls /user/hadoop/input   # 查看
hadoop jar world-count.jar tank.demo.WordCount input output
15/01/05 11:14:36 INFO mapred.Task: Task:attempt_local1938802295_0001_r_000000_0 is done. And is in the process of committing
15/01/05 11:14:36 INFO mapred.LocalJobRunner:
15/01/05 11:14:36 INFO mapred.Task: Task attempt_local1938802295_0001_r_000000_0 is allowed to commit now
15/01/05 11:14:36 INFO output.FileOutputCommitter: Saved output of task 'attempt_local1938802295_0001_r_000000_0' to hdfs://…(输出路径在原文中被截断)
15/01/05 11:14:36 INFO mapred.LocalJobRunner: reduce > reduce
15/01/05 11:14:36 INFO mapred.Task: Task 'attempt_local1938802295_0001_r_000000_0' done.
15/01/05 11:14:36 INFO mapreduce.Job: Job job_local1938802295_0001 running in uber mode : false
15/01/05 11:14:36 INFO mapreduce.Job: map 100% reduce 100%
15/01/05 11:14:36 INFO mapreduce.Job: Job job_local1938802295_0001 completed successfully
15/01/05 11:14:36 INFO mapreduce.Job: Counters: 32
    File System Counters
        FILE: Number of bytes read=17706
        FILE: Number of bytes written=597506
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=205
        HDFS: Number of bytes written=85
        HDFS: Number of read operations=25
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=5
    Map-Reduce Framework
        Map input records=2
        Map output records=14
        Map output bytes=136
        Map output materialized bytes=176
        Input split bytes=232
        Combine input records=0
        Combine output records=0
        Reduce input groups=10
        Reduce shuffle bytes=0
        Reduce input records=14
        Reduce output records=10
        Spilled Records=28
        Shuffled Maps =0
        Failed Shuffles=0
        Merged Map outputs=0
        GC time elapsed (ms)=67
        CPU time spent (ms)=0
        Physical memory (bytes) snapshot=0
        Virtual memory (bytes) snapshot=0
        Total committed heap usage (bytes)=456536064
    File Input Format Counters
        Bytes Read=80
    File Output Format Counters
        Bytes Written=85

查看输出目录下的文件:
[hadoop@tank1 ~]$ hadoop fs -cat /user/hadoop/output/part-r-00000
applicaiton 1
application 1
deer 1
first 1
hadoop 1
hello 2
is 1
my 4
this 1
world 1

已经正确统计出单词数量!