$ bin/nutch readseg -dump crawl-2/segments/20070420234414 dump-2
$ grep '<a name="item' dump-2/dump | sed 's/^.*item//' | sed 's/\".*$//' | sort -n > entry.txt
$ echo 74410 >> entry.txt
$ echo 74600 >> entry.txt
$ cat entry.txt | sed 's/^/http:\/\/kazuomik.livejournal.com\//' | sed 's/$/.html/' > urls-kazmuzikblog/nutch
$ cat urls-kazmuzikblog/nutch
http://kazuomik.livejournal.com/495.html
http://kazuomik.livejournal.com/722.html
http://kazuomik.livejournal.com/1008.html
...
http://kazuomik.livejournal.com/73761.html
http://kazuomik.livejournal.com/74000.html
http://kazuomik.livejournal.com/74410.html
http://kazuomik.livejournal.com/74600.html
$ vi conf/crawl-urlfilter.txt
+^http://kazuomik\.livejournal\.com/[1-9][0-9]*\.html$
-.
$ bin/nutch crawl urls-kazmuzikblog -dir crawl-3 -depth 1
...
$ bin/nutch readseg -list -dir crawl-3/segments
NAME GENERATED FETCHER START FETCHER END FETCHED PARSED
20070421201803 291 2007-04-21T20:18:09 2007-04-21T20:25:27 291 291
$
|