import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
/**
 * Looks up the stored content bytes for a single key (URL) in a segment's
 * content directory and, when run from the command line, writes those bytes
 * to standard output.
 *
 * <p>Only segments whose content directory yields exactly one
 * {@link MapFile.Reader} are supported; for any other layout
 * {@link #getContent(Path, Text)} returns {@code null}.
 */
public class ContentGetter extends Configured {

  /**
   * File system used to open the segment. May be {@code null} if it could not
   * be obtained in the constructor; {@link #getContent(Path, Text)} retries.
   */
  private FileSystem fs;

  /**
   * Creates a getter bound to the given configuration.
   *
   * <p>The constructor never throws a checked exception: if the file system
   * cannot be obtained here, the failure is reported on standard error and the
   * lookup is retried (and its exception propagated) on the first call to
   * {@link #getContent(Path, Text)}.
   *
   * @param conf configuration used to obtain the file system and the readers
   */
  public ContentGetter(Configuration conf) {
    super(conf);
    try {
      this.fs = FileSystem.get(getConf());
    } catch (IOException e) {
      // Keep the constructor non-throwing for existing callers; the real
      // error resurfaces from getContent() instead of a later NullPointerException.
      System.err.println("ContentGetter: could not obtain file system, will retry on first use: " + e);
    }
  }

  /**
   * Reads the content bytes stored under {@code key} in the given segment.
   *
   * @param segment path of the segment; the {@link Content#DIR_NAME}
   *     sub-directory of it is opened
   * @param key key to look up
   * @return the bytes of the stored {@link Content}, or {@code null} when the
   *     directory does not yield exactly one reader or the key is not present
   * @throws Exception if the file system cannot be obtained or reading fails
   */
  public byte[] getContent(final Path segment, final Text key) throws Exception {
    if (fs == null) {
      // The constructor could not get the file system; try again so the
      // caller sees the actual IOException.
      fs = FileSystem.get(getConf());
    }

    Path dir = new Path(segment, Content.DIR_NAME);
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, dir, getConf());
    if (readers == null) {
      return null;
    }

    try {
      if (readers.length != 1) {
        return null;
      }
      Content value = new Content();
      if (readers[0].get(key, value) == null) {
        // Key not present in the map file.
        return null;
      }
      return value.getContent();
    } finally {
      // Every opened reader is released, including on the early-return paths.
      closeAll(readers);
    }
  }

  /**
   * Closes every non-null reader in the array. All readers are attempted even
   * if one fails; the first failure is rethrown afterwards.
   */
  private static void closeAll(MapFile.Reader[] readers) throws IOException {
    IOException firstFailure = null;
    for (MapFile.Reader reader : readers) {
      if (reader == null) {
        continue;
      }
      try {
        reader.close();
      } catch (IOException e) {
        if (firstFailure == null) {
          firstFailure = e;
        }
      }
    }
    if (firstFailure != null) {
      throw firstFailure;
    }
  }

  /**
   * Command-line entry point: {@code ContentGetter segment url}. Writes the
   * content bytes to standard output, or reports on standard error and exits
   * with status 1 when no content is available for the key.
   *
   * @param args {@code args[0]} is the segment path, {@code args[1]} the key
   * @throws Exception if reading the segment fails
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      usage();
      return;
    }
    String input = args[0];
    String key = args[1];

    Configuration conf = NutchConfiguration.create();
    ContentGetter contentGetter = new ContentGetter(conf);
    byte[] content = contentGetter.getContent(new Path(input), new Text(key));
    if (content == null) {
      // Previously this case crashed with a NullPointerException in write().
      System.err.println("No content found for " + key + " in segment " + input);
      System.exit(1);
      return;
    }
    System.out.write(content);
    System.out.flush();
  }

  /** Prints the command-line synopsis to standard error. */
  private static void usage() {
    System.err.println("Usage: ContentGetter segment url");
  }
}
|