#!perl
    eval 'exec perl -S $0 "$@"'
	if 0;
#                              -*- Mode: Perl -*- 
# $Basename: perlindex.PL $
# $Revision: 1.4 $
# Author          : Ulrich Pfeifer
# Created On      : Mon Jan 22 13:00:41 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Wed Jun 18 19:43:36 2003
# Language        : Perl
# Update Count    : 302
# Status          : Unknown, Use with caution!
# 
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
# 
# 
# %SEEN is used to store the absolute pathes to files which have been
#       indexed. Probably this could be replaced by %FN.
# 
# %FN   $FN{'last'}    greatest documentid
#       $FN{$did}      a pair of $mtf and $filename where $mtf is the
#                      number of occurances of the most frequent word in
#                      the document with number $did.
# 
# %IDF  $IDF{'*all*'}  number of documents (essentially the same as 
#                      $FN{'last'})
#       $IDF{$word}    number of documents containing $word
# 
# %IF   $IF{$word}     list of pairs ($docid,$tf) where $docid is
#                      the number of a document containing $word $tf
# 
use Fcntl;
use less 'time';
use Getopt::Long;
use File::Basename;
use Text::English;

# NDBM_File as LAST resort
package AnyDBM_File;

@ISA = qw(DB_File GDBM_File SDBM_File ODBM_File NDBM_File) unless @ISA;

for $mod (@ISA) {
    last if eval "require $mod"
}

package main;

$p          = 'w'; # compressed int patch available
$nroff      = '';
$man1direxp = 'D:\Perl\man\man1';
$man3direxp = 'D:\Perl\man\man3';
@perllib    = 'D:\Perl\lib';
push @perllib,'D:\Perl\site\lib';
$prefix     = 'D:\Perl';
$pager      = 'more /e';
use Term::ReadKey;

$stemmer = \&Text::English::stem;
# directory for the index 
$IDIR = $man1direxp;
$IDIR =~ s:/[^/]*$::;

$opt_index   = '';                # make perl -w happy
$opt_menu    = 1;
$opt_maxhits = 15;
$opt_cbreak  = 1;
&GetOptions(
            'index',
            'cbreak!',
            'maxhits=i',
            'menu!',
            'verbose',
            'dict:i',
            'idir=s',
            ) || die "Usage: $0 [-index] [words ...]\n";

if (defined $opt_idir) {
    $IDIR = $opt_idir;          # avoid to many changes below.
}

if (defined $opt_dict) {
    $opt_dict ||= 100;
}

if ($opt_index) {
    &initstop;

    tie (%IF,   AnyDBM_File, "$IDIR/index_if",   O_CREAT|O_RDWR, 0644)
        or die "Could not tie $IDIR/index_if: $!\n";
    tie (%IDF,  AnyDBM_File, "$IDIR/index_idf",  O_CREAT|O_RDWR, 0644)
        or die "Could not tie $IDIR/index_idf: $!\n";
    tie (%SEEN, AnyDBM_File, "$IDIR/index_seen", O_CREAT|O_RDWR, 0644)
        or die "Could not tie $IDIR/index_seen: $!\n";
    tie (%FN,   AnyDBM_File, "$IDIR/index_fn",   O_CREAT|O_RDWR, 0644)
        or die "Could not tie $IDIR/index_fn: $!\n";

    require "find.pl";

    unless (@ARGV) {            # breaks compatibility :-(
        for $dir (@perllib) {
            print "Scanning $dir ... \n";
            &find($dir);
        }
    }
    for $name (@ARGV) {
        my $fns = $name;
        $fns =~ s:$prefix/::;
        next if $SEEN{$fns};
        next unless -f $name;
        if ($name !~ /(~|,v)$/) {
            $did = $FN{'last'}++;
            $SEEN{$fns} = &index($name, $fns, $did); 
        }
    }
    untie %IF;
    untie %IDF;
    untie %FN;
    untie %SEEN;
} elsif ($opt_dict) {
    tie (%IDF,  AnyDBM_File, "$IDIR/index_idf",  O_RDONLY, 0644)
        or die "Could not tie $IDIR/index_idf: $!\n".
            "Did you run '$0 -index'?\n";
    while (($key,$val) = each %IDF) {
        printf "%-20s %d\n", $key, $val if $val >= $opt_dict;
    }
    untie %IDF;
} else {
    tie (%IF,   AnyDBM_File, "$IDIR/index_if",   O_RDONLY, 0644)
        or die "Could not tie $IDIR/index_if: $!\n".
            "Did you run '$0 -index'?\n";
    tie (%IDF,  AnyDBM_File, "$IDIR/index_idf",   O_RDONLY, 0644)
        or die "Could not tie $IDIR/index_idf: $!\n";
    tie (%FN,   AnyDBM_File, "$IDIR/index_fn",   O_RDONLY, 0644)
        or die "Could not tie $IDIR/index_fn: $!\n";
    &search(@ARGV);
    untie %IF;
    untie %IDF;
    untie %FN;
    untie %SEEN;
}

sub wanted {
    my $fns = $name;

    if ($name eq $man3direxp) {
        $prune = 1;
    }
    $fns =~ s:$prefix/::;
    return if $SEEN{$fns};
    return unless -f $_;
    if ($name =~ /man|bin|\.(pod|pm)$/) {
        if (!/(~|,v)$/) {
            $did = $FN{'last'}++;
            $SEEN{$fns} = &index($name, $fns, $did); 
        }
    }
}

sub index {
    my $fn  = shift;
    my $fns = shift;
    my $did = shift;
    my %tf;
    my $maxtf = 0;
    my $pod = ($fns =~ /\.pod|man/);

    open(IN, "<$fn") || warn "Could not open $fn: $!\n", return (0);
    while ($line = <IN>) {
        if ($line =~ /^=head/) {
            $pod = 1;
        } elsif ($line =~ /^=cut/){
            $pod = 0;
        } else {
            next unless $pod;
        }
        for $word (&normalize($line)) {
            next if $stop{$word};
            $tf{$word}++;
        }
    }
    for $tf (values %tf) {
        $maxtf = $tf if $tf > $maxtf;
    }
    for $word (keys %tf) {
        $IDF{$word}++;
        $IF{$word} = '' unless defined $IF{$word}; # perl -w
        $IF{$word} .= pack($p.$p, $did, $tf{$word});
    }
    close(IN);
    $FN{$did} = pack($p, $maxtf).$fns; 
    print STDERR "$fns\n";
    1;
}

sub normalize {
    my $line = join ' ', @_;
    my @result;

    $line =~ tr/A-Z/a-z/;
    $line =~ tr/a-z0-9/ /cs;
    for $word (split ' ', $line ) {
        $word =~ s/^\d+//;
        next unless length($word) > 2;
        if ($stemmer) {
            push @result, &$stemmer($word);
        } else {
            push @result, $word;
        }
    }
    @result;
}

sub search {
    my %score;
    my $maxhits = $opt_maxhits;
    my (@unknown, @stop);

    &initstop if $opt_verbose;
    for $word (normalize(@_)) {
        unless ($IF{$word}) {
            if ($stop{$word}) {
                push @stop, $word;
            } else {
                push @unknown, $word;
            }
            next;
        }
        my %post = unpack($p.'*',$IF{$word});
        my $idf = log($FN{'last'}/$IDF{$word});
        for $did (keys %post) {
            my ($maxtf) = unpack($p, $FN{$did});
            $score{$did} = 0 unless defined $score{$did}; # perl -w 
            $score{$did} += $post{$did} / $maxtf * $idf;
        }
    }
    if ($opt_verbose) {
        print "Unkown:  @unknown\n" if @unknown;
        print "Ingnore: @stop\n" if @stop;
    }
    if ($opt_menu) {
        my @menu;
        my $answer = '';
        my $no = 0;
        my @s = ('1' .. '9', 'a' .. 'z');
        my %s;
        
        for $did (sort {$score{$b} <=> $score{$a}} keys %score) {
            my ($mtf, $path) = unpack($p.'a*', $FN{$did});
            my $s = $s[$no];
            push @menu, sprintf "%s %6.3f %s\n", $s, $score{$did}, $path;
            $s{$s} = ++$no;
            last unless --$maxhits;
        }
        &cbreak('on') if $opt_cbreak;
        while (1) {
            print @menu;
            print "\nEnter Number or 'q'> ";
            if ($opt_cbreak) {
                read(TTYIN,$answer,1);
                print "\n";
            } else {
                $answer = <STDIN>;
            }
            last if $answer =~ /^q/i;
            $answer = ($s{substr($answer,0,1)})-1;
            if ($answer >= 0 and $answer <= $#menu) {
                my $selection = $menu[$answer];
                if ($selection =~ m:/man:) {
                    my ($page, $sect) = 
                        ($selection =~ m:([^/]*)\.(.{1,3})$:);
                    print STDERR "Running man $sect $page\n";
                    system 'man', $sect, $page;
                } else {
                    my ($path) = ($selection =~ m:(\S+)$:);
                    $path = $prefix.'/'.$path;
                    print STDERR "Running pod2man $path\n";
                    system "pod2man --official $path | $nroff -man | $pager";
                }
            } else {
                my $path = $prefix."/bin/perlindex";
                system "pod2man --official $path | $nroff -man | $pager";
            }
        }
        &cbreak('off') if $opt_cbreak;
    } else {
        for $did (sort {$score{$b} <=> $score{$a}} keys %score) {
            printf("%6.3f %s\n", $score{$did}, 
                   (unpack($p.'a*', $FN{$did}))[1]);
            last unless --$maxhits;
        }
    }
}

sub cbreak {
    my $mode = shift;
    if ($mode eq 'on') {
        open(TTYIN, "</dev/tty") || die "can't read /dev/tty: $!";
        open(TTYOUT, ">/dev/tty") || die "can't write /dev/tty: $!";
        select(TTYOUT);
        $| = 1;
        select(STDOUT);
        $SIG{'QUIT'} = $SIG{'INT'} = 'cbreak';
	ReadMode 3; # Set cbreak mode
    } else {
	ReadMode 0; # Restore non-cbreak mode
    }
}


$stopinited = 0;                # perl -w
sub initstop {
    return if $stopinited++;
    while (<DATA>) {
        next if /^\#/;
        for (normalize($_)) {
          $stop{$_}++;
        }
    }
}

=head1 NAME

perlindex - index and query perl manual pages

=head1 SYNOPSIS

    perlindex -index

    perlindex tell me where the flowers are

=head1 DESCRIPTION

"C<perlindex -index>" generates an AnyDBM_File index which can be
searched with free text queries "C<perlindex> I<a verbose query>".

Each word of the query is searched in the index and a score is
generated for each document containing it. Scores for all words are
added and the documents with the highest score are printed.  All words
are stemed with Porters algorithm (see L<Text::English>) before
indexing and searching happens.

The score is computed as:

    $score{$document} += $tf{$word,$document}/$maxtf{$document}
                         * log ($N/$n{$word});

where

=over 10

=item C<$N>

is the number of documents in the index,

=item C<$n{$word}>

is the number of documents containing the I<word>,

=item C<$tf{$word,$document}>

is the number of occurances of I<word> in the I<document>, and

=item C<$maxtf{$document}>

is the maximum freqency of any word in I<document>.

=back

=head1 OPTIONS

All options may be abreviated.

=over 10

=item B<-maxhits> maxhits

Maximum numer of hits to display. Default is 15.

=item B<-menu>

=item B<-nomenu>

Use the matches as menu for calling C<man>. Default is B<-menu>.q

=item B<-cbreak>

=item B<-nocbreak>

Switch to cbreak in menu mode or dont. B<-cbreak> is the default.

=item B<-verbose>

Generates additional information which query words have been not found
in the database and which words of the query are stopwords.

=back

=head1 EXAMPLE

    perlindex foo bar

    1  3.735 lib/pod/perlbot.pod
    2  2.640 lib/pod/perlsec.pod
    3  2.153 lib/pod/perldata.pod
    4  1.920 lib/Symbol.pm
    5  1.802 lib/pod/perlsub.pod
    6  1.586 lib/Getopt/Long.pm
    7  1.190 lib/File/Path.pm
    8  1.042 lib/pod/perlop.pod
    9  0.857 lib/pod/perlre.pod
    a  0.830 lib/Shell.pm
    b  0.691 lib/strict.pm
    c  0.691 lib/Carp.pm
    d  0.680 lib/pod/perlpod.pod
    e  0.680 lib/File/Find.pm
    f  0.626 lib/pod/perlsyn.pod
    Enter Number or 'q'>

Hitting the keys C<1> to C<f> will display the corresponding manual
page. Hitting C<q> quits. All other keys display this manual page.

=head1 FILES

The index will be generated in your man directory. Strictly speaking in 
C<$Config{man1direxp}/..>

    The following files will be generated:

    index_fn           # docid -> (max frequency, filename)
    index_idf          # term  -> number of documents containing term
    index_if           # term  -> (docid, frequency)*
    index_seen         # fn    -> indexed?
    

=head1 AUTHOR

Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

=cut

__END__
# freeWAIS-sf stopwords
a
about
above
according
across
actually
adj
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
among
amongst
an
and
another
any
anyhow
anyone
anything
anywhere
are
aren't
around
as
at
b
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
behind
being
below
beside
besides
between
beyond
billion
both
but
by
c
can
can't
cannot
caption
co
co.
could
couldn't
d
did
didn't
do
does
doesn't
don't
down
during
e
each
eg
eight
eighty
either
else
elsewhere
end
ending
enough
etc
even
ever
every
everyone
everything
everywhere
except
f
few
fifty
first
five
vfor
former
formerly
forty
found "
four
from
further
g
h
had
has
hasn't
have
haven't
he
he'd
he'll
he's
hence
her
here
here's
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
i'd
i'll
i'm
i've
ie
if
in
inc.
indeed
instead
into
is
isn't
it
it's
its
itself
j
k
l
last
later
latter
latterly
least
less
let
let's
like
likely
ltd
m
made
make
makes
many
maybe
me
meantime
meanwhile
might
million
miss
more
moreover
most
mostly
mr
mrs
much
must
my
myself
n
namely
neither
never
nevertheless
next
nine
ninety
no
nobody
none
nonetheless
noone
nor
not
nothing
now
nowhere
o
of
off
often
on
once
one
one's
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
overall
own
p
per
perhaps
q
r
rather
recent
recently
s
same
seem
seemed
seeming
seems
seven
seventy
several
she
she'd
she'll
she's
should
shouldn't
since
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
stop
such
t
taking
ten
than
that
that'll
that's
that've
the
their
them
themselves
then
thence
there
there'd
there'll
there're
there's
there've
thereafter
thereby
therefore
therein
thereupon
these
they
they'd
they'll
they're
they've
thirty
this
those
though
thousand
three
through
throughout
thru
thus
to
together
too
toward
towards
trillion
twenty
two
u
under
unless
unlike
unlikely
until
up
upon
us
used
using
v
very
via
w
was
wasn't
we
we'd
we'll
we're
we've
well
were
weren't
what
what'll
what's
what've
whatever
when
whence
whenever
where
where's
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
who'd
who'll
who's
whoever
whole
whom
whomever
whose
why
will
with
within
without
won't
would
wouldn't
x
y
yes
yet
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
z
# occuring in more than 100 files
acc
accent
accents
and
are
bell
can
character
corrections
crt
daisy
dash
date
defined
definitions
description
devices
diablo
dummy
factors
following
font
for
from
fudge
give
have
header
holds
log
logo
low
lpr
mark
name
nroff
out
output
perl
pitch
put
rcsfile
reference
resolution
revision
see
set
simple
smi
some
string
synopsis
system
that
the
this
translation
troff
typewriter
ucb
unbreakable
use
used
user
vroff
wheel
will
with
you

