diff options
author | Robin H. Johnson <robbat2@gentoo.org> | 2008-03-20 17:41:03 -0700 |
---|---|---|
committer | Robin H. Johnson <robbat2@gentoo.org> | 2008-03-21 20:13:06 -0700 |
commit | 85766ce726d1971d8c49d351007a03be6010f101 (patch) | |
tree | f4ac06fad14d445a46b5283c4906f0be5e274b0e | |
parent | Add index creation script. (diff) | |
download | distindex-85766ce726d1971d8c49d351007a03be6010f101.tar.gz distindex-85766ce726d1971d8c49d351007a03be6010f101.tar.bz2 distindex-85766ce726d1971d8c49d351007a03be6010f101.zip |
Add tools to query and dump the index.
-rw-r--r-- | index-dumper.pl | 28 | ||||
-rw-r--r-- | index-query.pl | 44 |
2 files changed, 72 insertions, 0 deletions
diff --git a/index-dumper.pl b/index-dumper.pl
new file mode 100644
index 0000000..542ec02
--- /dev/null
+++ b/index-dumper.pl
@@ -0,0 +1,28 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+# Lucene stuff by Robin H. Johnson <robbat2@gentoo.org>
+# Print the toString form of every document in the Lucene index "data".
+
+use Lucene;
+use Data::Dumper;
+
+my $store = Lucene::Store::FSDirectory->getDirectory("data", 0);
+my $reader = Lucene::Index::IndexReader->open($store);
+
+# get number of docs in index
+# NOTE(review): assumes ids 0..numDocs-1 are all live (no deletions) - confirm
+my $num_docs = $reader->numDocs();
+
+for my $i (0 .. $num_docs - 1) {
+    # get the nth document
+    my $doc = $reader->document($i);
+    # Document->fields is missing in the Perl bindings :-( so single values
+    # have to be fetched by name, e.g. $doc->get('md5'); for a dump the
+    # document's own toString output is printed instead.
+    print $doc->toString() . "\n";
+}
+
+$reader->close;
+undef $reader;
diff --git a/index-query.pl b/index-query.pl
new file mode 100644
index 0000000..8e44c25
--- /dev/null
+++ b/index-query.pl
@@ -0,0 +1,44 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+# Lucene stuff by Robin H. Johnson <robbat2@gentoo.org>
+# Run one hard-coded query against the Lucene index "data" and print
+# "path md5 size" for each hit.
+
+use Lucene;
+use Data::Dumper;
+
+# Constructors are called as Class->new(...) rather than with the indirect
+# "new Class(...)" form, which Perl can mis-parse.
+my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer->new();
+my $store = Lucene::Store::FSDirectory->getDirectory("data", 0);
+my $searcher = Lucene::Search::IndexSearcher->new($store);
+my $parser = Lucene::QueryParser->new("filename", $analyzer);
+
+# Numeric range queries do not work quite as you might expect: at present
+# they are compared as strings, not as numbers, e.g. "size:[0 TO 9000]".
+# http://lucene.apache.org/java/docs/queryparsersyntax.html
+my $query = $parser->parse("distfile:akode* AND filename:m* AND isdistfile:0");
+my $hits = $searcher->search($query);
+
+# get number of results
+my $num_hits = $hits->length();
+
+# print the path, md5 and size fields of each hit
+for my $i (0 .. $num_hits - 1) {
+    my $doc = $hits->doc($i);
+    my $path = $doc->get("path");
+    my $size = $doc->get("size");
+    my $md5 = $doc->get("md5");
+    printf "%s %s %d\n", $path, $md5, $size;
+}
+
+# free memory and close searcher
+undef $hits;
+undef $query;
+undef $parser;
+undef $analyzer;
+$searcher->close();
+undef $searcher;
+undef $store;