import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ByteArrayInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.UTF8;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.protocol.Content;
/**
 * Command-line tool that prints the filtered content of every record stored
 * in a segment's content data file.
 *
 * <p>Only the {@code part-00000/data} file under the segment's content
 * directory is read; other parts, if any exist, are not visited.
 */
public final class LiveJournalEntryExtractor {

  /**
   * Reads every (key, content) record from the segment named by
   * {@code argv[0]} and prints its filtered lines to standard output.
   *
   * <p>When no argument is given, a usage line is printed and the method
   * returns without touching the file system.
   *
   * @param argv command-line arguments; {@code argv[0]} is the segment path
   * @throws Exception if the file system or the sequence file cannot be
   *         opened or read
   */
  public static void main(String argv[]) throws Exception {
    String usage = "LiveJournalEntryExtractor segment";
    if (argv.length < 1) {
      System.out.println("usage:" + usage);
      return;
    }

    Configuration conf = NutchConfiguration.create();
    FileSystem fs = FileSystem.get(conf);
    String segment = argv[0];
    try {
      Path file = new Path(segment, Content.DIR_NAME + "/part-00000/data");
      SequenceFile.Reader contents = new SequenceFile.Reader(fs, file, conf);
      try {
        while (true) {
          // Fresh key/value holders per record, as in the original loop.
          Text key = new Text();
          Content content = new Content();
          if (!contents.next(key, content)) {
            break;
          }
          printEntries(key, content);
        }
      } finally {
        // Close the reader even when next() or printEntries() throws;
        // previously it was only closed after a fully successful scan.
        contents.close();
      }
    } finally {
      fs.close();
    }
  }

  /**
   * Prints one record: an HTML comment line carrying the key, followed by
   * every line that {@code LiveJournalEntryFilterReader} yields for the
   * record's raw bytes.
   *
   * <p>The bytes are always decoded as UTF-8; this method does not consult
   * any charset information the record may carry.
   *
   * @param key     record key, echoed inside {@code <!-- ... -->}
   * @param content record whose raw bytes are filtered and printed
   * @throws IOException if decoding or reading the content fails
   */
  private static void printEntries(Text key, Content content)
      throws IOException {
    System.out.println("<!-- " + key + " -->");

    // Keep a handle on the decoding reader so it can be closed without
    // relying on the filter reader's own API.
    InputStreamReader decoded = new InputStreamReader(
        new ByteArrayInputStream(content.getContent()), "UTF-8");
    try {
      LiveJournalEntryFilterReader in =
          new LiveJournalEntryFilterReader(decoded);
      String line;
      while ((line = in.readLine()) != null) {
        System.out.println(line);
      }
    } finally {
      decoded.close();
    }
  }
}
|