Counters are a very useful feature in Hadoop. They help us track global events in a job, i.e. across the map and reduce phases.
When we execute a MapReduce job, we can see a lot of counters listed in the logs. Besides the default built-in counters, we can create our own custom counters, which are listed along with the built-in ones.
This helps us in several ways. Here I am explaining a scenario where I use custom counters to count the number of good words and stop words in the given text files. The stop words for this program are provided at run time through the distributed cache.
This is a mapper-only job; setting job.setNumReduceTasks(0) makes it one.
Here I am introducing another feature in Hadoop called the distributed cache.
The distributed cache distributes application-specific, read-only files efficiently to every node running the application.
My requirement is to filter the stop words out of input text files. The stop word list may vary, so if I hard-code the list in my program, I have to update the code every time the list changes, which is not a good practice. Instead, I used the distributed cache: the file containing the stop words is loaded into the cache, which makes it available to the mappers as well as the reducers. In this program, we don't require any reducer.
The code is attached below. You can also get the code from GitHub.
package com.hadoop.skipper;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SkipMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    private Text word = new Text();
    private Set<String> stopWordList = new HashSet<String>();

    /*
     * Load the stop word file(s) from the distributed cache before the map
     * tasks start.
     */
    @SuppressWarnings("deprecation")
    @Override
    protected void setup(Context context) throws IOException,
            InterruptedException {
        try {
            Path[] stopWordFiles = context.getLocalCacheFiles();
            if (stopWordFiles != null && stopWordFiles.length > 0) {
                for (Path stopWordFile : stopWordFiles) {
                    readStopWordFile(stopWordFile);
                }
            }
        } catch (IOException e) {
            System.err.println("Exception reading stop word file: " + e);
        }
    }

    /*
     * Read the stop word file and collect the stop words, one per line.
     */
    private void readStopWordFile(Path stopWordFile) {
        try {
            BufferedReader fis = new BufferedReader(new FileReader(
                    stopWordFile.toString()));
            String stopWord = null;
            while ((stopWord = fis.readLine()) != null) {
                stopWordList.add(stopWord);
            }
            fis.close();
        } catch (IOException ioe) {
            System.err.println("Exception while reading stop word file '"
                    + stopWordFile + "' : " + ioe.toString());
        }
    }

    /*
     * Tokenize each line, increment the STOPWORDS or GOODWORDS counter for
     * every token, and emit only the good words.
     */
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line);
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (stopWordList.contains(token)) {
                context.getCounter(StopWordSkipper.COUNTERS.STOPWORDS)
                        .increment(1L);
            } else {
                context.getCounter(StopWordSkipper.COUNTERS.GOODWORDS)
                        .increment(1L);
                word.set(token);
                context.write(word, NullWritable.get());
            }
        }
    }
}
package com.hadoop.skipper;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

@SuppressWarnings("deprecation")
public class StopWordSkipper {

    // Custom counters; these appear in the job logs along with the built-in ones
    public enum COUNTERS {
        STOPWORDS,
        GOODWORDS
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        GenericOptionsParser parser = new GenericOptionsParser(conf, args);
        args = parser.getRemainingArgs();

        Job job = new Job(conf, "StopWordSkipper");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setJarByClass(StopWordSkipper.class);
        job.setMapperClass(SkipMapper.class);
        // Zero reduce tasks makes this a mapper-only job
        job.setNumReduceTasks(0);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        List<String> other_args = new ArrayList<String>();

        // Read the location of the stop word file from the command line.
        // The argument after the -skip option is taken as the stop word file
        // and is added to the distributed cache.
        for (int i = 0; i < args.length; i++) {
            if ("-skip".equals(args[i])) {
                DistributedCache.addCacheFile(new Path(args[++i]).toUri(),
                        job.getConfiguration());
                if (i + 1 < args.length) {
                    i++;
                } else {
                    break;
                }
            }
            other_args.add(args[i]);
        }

        FileInputFormat.setInputPaths(job, new Path(other_args.get(0)));
        FileOutputFormat.setOutputPath(job, new Path(other_args.get(1)));

        job.waitForCompletion(true);

        // Read the custom counter values after the job finishes
        Counters counters = job.getCounters();
        System.out.printf("Good Words: %d, Stop Words: %d\n",
                counters.findCounter(COUNTERS.GOODWORDS).getValue(),
                counters.findCounter(COUNTERS.STOPWORDS).getValue());
    }
}
Create a Java project with the above Java classes and add the dependent libraries (they are present in your Hadoop installation). Export the project as a runnable jar and execute it. The file containing the stop words should be present in HDFS, with the stop words added one per line. A sample format is given below.
is
the
am
are
with
was
were
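If the stop word file is still on the local file system, it can be copied into HDFS before running the job. The commands below are only an example; the paths are placeholders matching the sample run further down.
hadoop fs -mkdir /user/hadoop/skip
hadoop fs -put skip.txt /user/hadoop/skip/skip.txt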
A sample command to execute the program is given below.
hadoop jar <jar-name> -skip <stop-word-file-in-hdfs> <input-data-location> <output-location>
Eg: hadoop jar Skipper.jar -skip /user/hadoop/skip/skip.txt /user/hadoop/input /user/hadoop/output
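Since this is a mapper-only job, the filtered words are written directly by the map tasks; with TextOutputFormat the output files are typically named part-m-00000, part-m-00001 and so on, one per map task. Assuming the sample paths above, the output can be inspected with something like:
hadoop fs -cat /user/hadoop/output/part-m-00000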
You can also see the custom counters in the job logs. A sample log is attached below.
The method getLocalCacheFiles() is undefined for the type Mapper.Context
I tried to compile this code but I got the above error in the editor even before compiling… could you please tell me what should be done to resolve this error?
Which version and distribution of Hadoop are you using?
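If it is a Hadoop 2.x release with the new MapReduce API, getLocalCacheFiles() may no longer be available on Mapper.Context. A rough, untested sketch of the newer equivalents (reusing the names from the listing above, and needing java.net.URI imported) would be:

// Driver side (Hadoop 2.x): add the stop word file through the Job object
job.addCacheFile(new Path(args[++i]).toUri());

// Mapper setup rewritten for the Hadoop 2.x API
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    URI[] cacheFiles = context.getCacheFiles();
    if (cacheFiles != null) {
        for (URI cacheFile : cacheFiles) {
            // Cache files are symlinked into the task working directory
            // under their base names, so they can be read as local files.
            readStopWordFile(new Path(new Path(cacheFile.getPath()).getName()));
        }
    }
}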
Thanks for the excellent write-up.
While adding multiple paths to the DistributedCache in the main function, there should be a while statement rather than an if:
if ("-skip".equals(args[i])) –> while ("-skip".equals(args[i]))
Thanks for the comment. The if condition worked for my requirement, since only a single -skip option is passed here; for multiple -skip options a loop like the one you suggest would indeed be needed.
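For readers who do want to pass more than one -skip option, a possible variant of the argument-parsing loop in the driver (untested here, but keeping the variable names from the listing above) is:

List<String> other_args = new ArrayList<String>();
for (int i = 0; i < args.length; i++) {
    if ("-skip".equals(args[i]) && i + 1 < args.length) {
        // Every "-skip <file>" pair adds one more file to the distributed cache
        DistributedCache.addCacheFile(new Path(args[++i]).toUri(),
                job.getConfiguration());
    } else {
        other_args.add(args[i]);
    }
}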