#!/usr/bin/perl -w
#
# *** getwebref v0.9 alpha (24 jun 1998) ***
#
# Copyright (C) 1998 B.W. van Schooten
#
# Purpose: Find article information on the Web by supplying the title.
# Method: Select the title or a substring of the title with your mouse and
# run the program. After a while, a results file is generated in the current
# directory and you can browse through the information found.
#
# Details: The last X selection (cutbuffer 0) is neatened up and used as an
# exact-match query for the av.yahoo.com database. The documents
# corresponding to the first 20 search hits are retrieved and the 7 lines
# surrounding where the match occurs are put in a HTML file, together with
# the URLs of the documents. Then, Lynx is started to view the results.
#
# Obvious limitations:
#   * Should also allow stdin instead of cutbuffer 0
#   * # of lines of context is not configurable
#   * Document generated is not quite valid HTML because of the rather
#     arbitrary embedding of HTML fragments.
#
# Bugs:
#   * Addresses with nondefault ports don't seem to work properly

# Open the HTML results file up front.  Every later stage prints to HTMLOUT,
# so abort right away if it cannot be created.
open(HTMLOUT, '>', 'getwebref-results.html')
	or die "getwebref: cannot write getwebref-results.html: $!\n";

# Get the current X selection (cutbuffer 0) through the xselection helper.
# The command is a single string so that `~' is still expanded by the shell.
open(XSELECTION, '-|', '~/bin/xselection -cutbuffer 0')
	or die "getwebref: cannot run xselection: $!\n";
@xsel=<XSELECTION>;
# For a pipe, close() is where a missing or failing helper shows up.
close(XSELECTION)
	or warn "getwebref: xselection did not exit cleanly (status $?)\n";
print "SELECTION: @xsel\n";


# `Neaten up' the X selection so that it suits the search engine: every run
# of nonword characters (a literal `+' included) collapses into one `+'
# separator, leading and trailing separators are removed, and everything is
# made lowercase.
$_ = "@xsel\n";
s/\+/-/g;           # a literal `+' must count as a nonword: hide it first
s/[^\w\+]+/\+/g;    # each run of nonwords becomes a single `+' separator
s/^\+//;
s/\+$//;
tr/A-Z/a-z/;
$query=$_;

# An empty query is useless as a search phrase.  Worse, an empty pattern in
# the later /$query/ match makes Perl reuse the last successfully matched
# regexp instead of looking for the title, so stop here.
if ($query eq "") {
	die "getwebref: X selection contains no searchable words\n";
}
print "QUERY: $query\n";
print HTMLOUT "<html><head><title>Results of $query</title></head><body>\n";

# Make the connection to the av.yahoo.com search engine: the request is piped
# into telnet and the reply is read back through INDEXOUT.  The query is
# wrapped in %22 (URL-encoded double quotes).  At this point $query holds
# only word characters and `+', so it is safe inside the shell command.
open(INDEXOUT, '-|', "echo \"GET /bin/query?p=\%22$query\%22\" | telnet av.yahoo.com 80")
	or die "getwebref: cannot start telnet to av.yahoo.com: $!\n";

# Modify the query to fit the regexp search of the documents: `+' -> `\W*',
# i.e. the words may be separated by any run of nonword characters.
$query =~ s/\+/\\W*/g;

# Now, we collect and parse the search engine's reply (read from INDEXOUT).
# It is assumed that the URLs of the hits can be identified by searching for:
# <beginning of line> `<LI>' `<A HREF=' <quote> <urlname> <quote> ...
# Every hit is fetched, and each 7-line window whose middle three lines
# contain the query is copied to the results page.
HIT: while (defined(my $url = <INDEXOUT>)) {
	# chomp (not chop), so that a final line without a newline keeps its
	# last character; also drop a trailing CR if the line carries one.
	chomp($url);
	$url =~ s/\r$//;

	# check and remove prefix of hit; any other line is not a hit
	next HIT unless $url =~ s/^\s*<\s*li\s*>\s*<a\s*href="//i;

	# The URL ends at its closing quote, whatever follows it (`>' or more
	# attributes), so cut there to obtain the stripped url.
	$url =~ s/".*//;
	print "CHECKING $url\n";
	print HTMLOUT "<P>\n<A HREF=\"$url\">$url</a><P>\n";

	# Obtain host, optional port and path.  The result of the match is
	# checked: a URL that does not parse (https, odd port, ...) is skipped
	# rather than passing undefined values on to telnet.  The host must
	# start with a word character so it can never look like an option.
	my ($host, $port, $path) =
		$url =~ m{http://(\w[\w.\-]*)(?::(\d+))?(/.*)?\z};
	unless (defined $host) {
		print "SKIPPING $url (not a plain http URL)\n";
		next HIT;
	}
	$port = 80  unless defined $port;	# no port given? default port.
	$path = "/" unless defined $path;	# bare `http://host': ask for root

	# The path comes from a remote page and is about to be placed inside a
	# double-quoted shell word.  Percent-encode the characters the shell
	# would still interpret there (dollar, backquote, backslash, quote).
	$path =~ s/(["\$\\`])/sprintf("%%%02X", ord($1))/ge;

	print "$host $port   $path \n";
	unless (open(URLIN, '-|', "echo \"GET $path\" | telnet $host $port")) {
		warn "getwebref: cannot start telnet for $url: $!\n";
		next HIT;
	}

	# init multiline buffer: 7 lines of context, all empty to start with
	my @linebuf = ("") x 7;
	while (defined(my $nline = <URLIN>)) {
		# (stripping of HTML codes is disabled)
#		$nline =~ s/\<\s*\w+.*\>//;

		# scroll the multiline buffer
		shift(@linebuf);
		push(@linebuf, $nline);

		# search for the (lowercase) query in the middle of the buffer
		my $window = lc($linebuf[2] . $linebuf[3] . $linebuf[4]);
		if ($window =~ /$query/) {
			print "@linebuf";
			print HTMLOUT "<UL>\n@linebuf\n</UL>";
			# start afresh so the same lines are not reported twice
			@linebuf = ("") x 7;
		}
	}
	# Reap the telnet pipeline.  Its exit status is deliberately ignored:
	# a nonzero status here does not mean the fetch failed.
	close(URLIN);
}
close(INDEXOUT);

# Finish the results page.  Buffered write errors (a full disk, for example)
# only surface when the handle is closed, so the close is checked.
print HTMLOUT "</body></html>\n";
close(HTMLOUT)
	or die "getwebref: error writing getwebref-results.html: $!\n";

# View the results with Lynx.  The list form runs lynx directly, without
# going through a shell.
system("lynx", "getwebref-results.html") == 0
	or warn "getwebref: could not run lynx (status $?)\n";


