#!/usr/bin/perl -w
#
# Copyright (c) 2000-2008 University of Utah and the Flux Group.
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
#
use English;
use Getopt::Std;
use File::Basename;

#
# Search our web index. Spits out HTML. Sorry, easier that way!
# 
sub usage {
    # Report how to call the script on STDERR, then exit with status -1.
    # Takes no arguments and never returns.
    my $message = "Usage: websearch ...\n";

    print(STDERR $message);
    exit(-1);
}
my $optlist = "";

#
# Configure variables. The prefix placeholder below is presumably filled in
# when this template is configured; it must stay exactly as written.
#
my $TB    = "@prefix@";
my $WWW   = "$TB/www";
my $index = "$WWW/site.index";
my $swish = "swish-e";

# State shared by the rest of the script.
my ($searchstring, $searchwords, $numhits);
my (%hits, %lines);	# both indexed by document.
my @order;		# documents, in the order they were reported.

# Un-taint the environment: fixed PATH, and drop the variables that can
# change how a shell behaves.
$ENV{PATH} = join(":", "/bin", "/usr/bin", "/usr/local/bin");
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};

# Flush STDOUT after every write instead of buffering it.
$OUTPUT_AUTOFLUSH = 1;

#
# Very simple; the first argument is the string to search for, and it is
# required.
#
if (scalar(@ARGV) == 0) {
    usage();
}
$searchstring = $ARGV[0];

#
# Change to the web directory before running the search tools.
#
if (!chdir($WWW)) {
    die("*** $0:\n" .
	"    Could not chdir to $WWW!\n");
}

#
# Run swish. The next version of swish will include a perl API module, so
# we will be able to avoid this extra process. For now, we are stuck
# calling swish-e.
#
# The list form of a piped open execs the program directly, without going
# through a shell, so the search string is handed over as exactly one
# argument even when it contains quotes or other shell metacharacters.
# The -x format is double quoted on purpose: swish-e receives a real
# newline at the end of the format string.
#
open(my $swishfh, "-|",
     $swish, "-f", $index, "-x", "%r %p %l \"%D\"\n", "-w", $searchstring) or
    die("*** $0:\n".
	"    Could not invoke swish-e on '$searchstring'!\n");

#
# Read back results. Fills in $searchwords and $numhits from the header
# lines, and %hits, %lines and @order from the result lines.
#
while (my $result = <$swishfh>) {
    #
    # First few lines are special; they all start with a '#'.
    #
    if ($result =~ /^\#/) {
	if ($result =~ /^\# Search words: (.*)$/) {
	    $searchwords = $1;
	}
	elsif ($result =~ /^\# Number of hits: (\d*)$/) {
	    $numhits = $1;
	}
	next;
    }

    # A line holding a single dot ends the results.
    last
	if ($result =~ /^\.$/);

    #
    # By this point we should have seen the header lines go by; ignore
    # anything that shows up before them.
    #
    next
	if (!defined($searchwords) || !defined($numhits));

    # One hit per line, in the layout requested with -x above.
    if ($result =~ /^(\d*) (.*) (\d*) \"(.*)\"$/) {
	my ($rank, $path, $size, $mod) = ($1, $2, $3, $4);

	$hits{$path}  = [ $rank, $path, $size, $mod ];
	$lines{$path} = [];
	push(@order, $path);
    }
}
close($swishfh);

#
# Build a query to pass to agrep; this is crude, but the next version of
# swish will handle this, so not going to worry; it will do the job.
# AND becomes ';', OR becomes ',' and unescaped double quotes are dropped.
# $searchwords is undefined when swish did not report any, so fall back to
# an empty query rather than operating on undef.
#
my $query = defined($searchwords) ? $searchwords : "";
$query =~ s/\s+AND\s+/\;/ig;
$query =~ s/\s+OR\s+/\,/ig;
$query =~ s/(?<!\\)\"//ig;

#
# Run agrep on the list of files, but only when there is something to look
# for and somewhere to look.
#
# agrep is started with a forking open and a list exec, so no shell ever
# sees the query or the file names; quotes, spaces and metacharacters in
# either are passed through untouched. The trailing /dev/null argument is
# kept from the original command line.
#
if (keys(%hits) && $query ne "") {
    my $agreppid = open(my $agrepfh, "-|");

    die("*** $0:\n".
	"    Could not invoke agrep with '$query'!\n")
	if (!defined($agreppid));

    if ($agreppid == 0) {
	# Child: STDOUT is the pipe back to the parent. Discard agrep's
	# complaints, then become agrep.
	open(STDERR, ">", "/dev/null");
	exec("agrep", "-w", "-i", $query, keys(%hits), "/dev/null");
	# Only reached when the exec failed; the parent just sees no output.
	exit(1);
    }

    while (my $match = <$agrepfh>) {
	#
	# Output is "file: line". Split at the FIRST ": " so that matching
	# lines which themselves contain ": " are still credited to their
	# file. Keep at most 10 lines per document.
	#
	if ($match =~ /^(.*?): (.*)$/) {
	    my ($file, $text) = ($1, $2);

	    push(@{ $lines{$file} }, $text)
		if (exists($lines{$file}) && scalar(@{ $lines{$file} }) < 10);
	}
    }
    close($agrepfh);
}

#
# Print the hits as an HTML list, in the order swish reported them. Each
# hit shows its rank, a link to the document, the modification date, and a
# nested list of the matching lines collected for it.
#
if (@order) {
    print "Rank, filename, modification date, matching lines<br><br>\n";
    print "<ul>\n";

    foreach my $hit (@order) {
	# The third element (size) is stored but not displayed.
	my ($rank, $path, undef, $mod) = @{ $hits{$hit} };
	my $url = $path;

	print "<hr>\n";
	print "<li> $rank - <b><a href=$url>$path</a></b>, $mod<br>\n";
	print "<ul>\n";
	foreach my $line (@{ $lines{$hit} }) {
	    # The matching lines are raw page source; strip the markup so
	    # it cannot disturb the result list.
	    $line =~ s/<([^>]*)>//ig;	# Matched <...>
	    $line =~ s/<[^>]*//ig;	# Unmatched <...

	    # Nothing (or only whitespace) left; do not emit an empty item.
	    next
		if ($line !~ /\S/);

	    print "<li> $line\n";
	}
	print "</ul>\n";
    }

    # Close the outer list. This has to be a print statement: a bare
    # </ul> is parsed by Perl as a glob and prints nothing.
    print "</ul>\n";
}

print "<hr><br><br>".
      "Web Search powered by <a href=\"http://swish-e.org/\"><b>Swish-e</b></a>".
      "\n";

exit(0);