#!/usr/bin/perl -w
#
# Mini search engine 1.3, based on Guppie 1.0: Minimal HTTP server
# Copyright (C) 1999 B.W. van Schooten
#
# In:
# `GET' <filespec> `\n'
# (<parameter> `\n')*
# `\n'
# Out:
# (<parameter> `\n')*
# `\n'
# <data>
#
# Mod 19992809 Boris, modified for new AltaVista
# Mod 19990303 Boris, modified to enable `streaming' Alta Vista hits
#              Now connects to port 8181
# Mod 19990513 Boris, modified so as to filter Alta Vista output
# Mod 19990226 Boris, added `real' HTTP1.0 protocol.


# from manual
require 5.002;

# some includes
use POSIX ":sys_wait_h"; # def. of WNOHANG
use Socket;
use Fcntl;


# Writing to a socket the client has already closed raises SIGPIPE;
# ignore it so the server keeps running.
$SIG{PIPE} = 'IGNORE';

# size limit in bytes (original note: allow only files of reasonable size)
$maxfilesize = 500000;

# listening port and transport protocol number
$port = 8282;
#$port = $1 if $port =~ /(\d+)/; # untaint port number
$proto = getprotobyname('tcp');

# Create the listening socket: allow quick rebinding of the address,
# bind to every local interface and start queueing connections.
socket(Server, PF_INET, SOCK_STREAM, $proto)
	or die "socket: $!";
setsockopt(Server, SOL_SOCKET, SO_REUSEADDR, pack("l", 1))
	or die "setsockopt: $!";
bind(Server, sockaddr_in($port, INADDR_ANY))
	or die "bind: $!";
listen(Server, SOMAXCONN)
	or die "listen: $!";

# startup banner on the console
print "GuppieSearch HTTP search engine is now accepting connections on port $port.\n";

# ---------------------------------------------------------------------------
# Request handling.
#
# The server handles one client at a time: accept a connection, read the
# request line and headers, then answer with either the search form or the
# filtered result pages fetched from the upstream search engine.
# ---------------------------------------------------------------------------

# upstream search engine that is queried for hits
my $searchhost = 'www.altavista.com';
my $searchport = 80;

# Write a string to the connected client, retrying on partial writes.
# At most $maxfilesize bytes are sent.  Returns the number of bytes that
# were actually written (less than requested if the client went away).
sub send_to_client {
	my ($data) = @_;
	my $total = length $data;
	$total = $maxfilesize if $total > $maxfilesize;
	my $sent = 0;
	while ($sent < $total) {
		my $n = syswrite(Client, $data, $total - $sent, $sent);
		last unless $n;		# undef (error) or 0: stop instead of spinning
		$sent += $n;
	}
	return $sent;
}

# Escape the characters that are special in HTML text and attribute values.
sub html_escape {
	my ($text) = @_;
	$text =~ s/&/&amp;/g;
	$text =~ s/</&lt;/g;
	$text =~ s/>/&gt;/g;
	$text =~ s/"/&quot;/g;
	$text =~ s/'/&#39;/g;
	return $text;
}

# Turn the raw query part of the request (e.g. `?q=foo+%22bar%22') into a
# string that can safely be used as the default value of the search box:
# strip the `?q=' prefix, undo the URL encoding, then HTML-escape the result.
sub form_value_from_query {
	my ($query) = @_;
	$query =~ s/^\?q=//;
	$query =~ s/\+/ /g;
	$query =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge;
	return html_escape($query);
}

# Connect to the upstream search engine and send a simple GET request for
# $path.  On success the response can be read from INDEXOUT and 1 is
# returned; on any failure INDEXOUT is closed and 0 is returned.
# A direct socket is used on purpose: the request path contains client
# input and must never pass through a shell.
sub open_search_page {
	my ($path) = @_;
	my $iaddr = inet_aton($searchhost);
	return 0 unless defined $iaddr;
	socket(INDEXOUT, PF_INET, SOCK_STREAM, $proto) or return 0;
	unless (connect(INDEXOUT, sockaddr_in($searchport, $iaddr))) {
		close(INDEXOUT);
		return 0;
	}
	unless (defined syswrite(INDEXOUT, "GET $path\r\n")) {
		close(INDEXOUT);
		return 0;
	}
	return 1;
}

# main loop
while (1) {
	#sleep 1;

	# accept connection
	my $paddr = accept(Client, Server);

	# Did we establish new connection?
	next unless defined($paddr);

	# parse client's request; a client that hangs up without sending
	# anything yields undef here
	my $line = <Client>;
	unless (defined $line) {
		close Client;
		next;
	}
	print "Request: $line";
	# skip (and log) the request headers up to the first empty line
	while (defined(my $param = <Client>)) {
		last unless $param =~ /\w/;
		print "    Parameter: $param";
	}
	my $time = localtime;
	print "    on $time\n";

	# filter request argument; the submit button's own parameter is dropped
	my $filename = "index.html";
	if ($line =~ /GET\s*\/([^\s]+)/) {
		$filename = $1;
		$filename =~ s/\&search\=Search$//;
	}
	print "    Transferring `$filename': ";

	# output standard protocol blurb (header lines end in CRLF)
	send_to_client(join("\r\n",
		"HTTP/1.1 200 OK",
		"Server: GuppieSearch/1.3",
		"Accept-Ranges: bytes",
		"Connection: close",
		"Content-Type: text/html",
		"", ""));

	# select what to output
	my $html = "<HTML><HEAD><TITLE>GuppieSearch 1.3</title></head><body>\n";
	$html .= "<FORM name=mfrm method=GET action=\"/zoek\">\n";
	if ($filename =~ /^zoek(.+)/) {
		my $searchkey = $1;
		# translate search key back to a form useable for form default value
		my $formvalue = form_value_from_query($searchkey);
		$html .= "<INPUT NAME=q size=50 maxlength=800 VALUE=\"$formvalue\">\n";
		$html .= "<input type=submit name=search value=\"Search\"></FORM>\n";

		# fetch the result pages (10 hits each) one by one so that the
		# client sees the first hits while later pages are still loading
		for (my $i = 0; $i < 200; $i += 10) {
			# flush current buffer
			my $sent = send_to_client($html);
			print "$sent bytes sent.  ";
			$html = "";
			my $htmlhits = "";

			my $path = "/cgi-bin/search$searchkey";
			$path .= "&stq=$i" if $i > 0;
			unless (open_search_page($path)) {
				print "(search host unreachable)  ";
				$html = "Search engine unreachable.<br>\n";
				last;
			}

			# NOTE: the hit lines below are upstream HTML and are passed
			# through to the client without escaping.
			my $hitlinenr = 0;
			while (<INDEXOUT>) {
				print $_;
				#First time round, check for hit statistics
				if ($i == 0) {
					if (/([^0-9]*[0-9,]*)\s*pages\s*found\.\s*\<br\>/) {
						$htmlhits .= "Total hits: $1<BR>";
					}
					if (/\>(word\s*count[^<]*)[^>]*\>([^<]*)/) {
						$htmlhits .= "$1<BR>$2<BR>";
					}
				}
				#Search hits look like:
				#0: <dl><dt>
				#1: <b>[NUMBER]. </b>
				#2: <a href="[URL]">
				#3: <b>[TITLE]</b></a><dt>
				#4: <dd>[TEXT]</dd>
				#5: <br><b>URL: </b>.*
				#   .*
				#   </dl>
				# $hitlinenr is the line of the hit that is expected next.
				if (/^\<dl\>\<dt\>/ and $hitlinenr == 0) {
					$hitlinenr = 1;
				} elsif (/^(\<b\>[0-9]+\.\s*)/ and $hitlinenr == 1) {
					$html .= $1;
					$hitlinenr = 2;
				} elsif ($hitlinenr == 2) {
					$html .= $_;
					$hitlinenr = 3;
				} elsif ($hitlinenr == 3) {
					s/\<b\>//;
					s/\<\/dt\>//;
					$html .= $_;
					$hitlinenr = 4;
				} elsif ($hitlinenr == 4) {
					s/\<dd\>//;
					s/\<\/dd\>//;
					$html .= "<br>".$_."<br>\n";
					$hitlinenr = 0;
				}
			}
			close(INDEXOUT);
			$html = "<b>".$htmlhits."</b>".$html;
		}
	} else {
		# index.html and every other path: show the empty search form.
		# The (Dutch) help text explains the +word, -word and "phrase"
		# query operators.
		$html .= "<INPUT NAME=q size=50 maxlength=800 VALUE=\"\">\n";
		$html .= "<input type=submit name=search value=\"Search\"></FORM>\n";
		$html .= "+naam = naam moet voorkomen<br>\n";
		$html .= "-naam = naam mag niet voorkomen<br>\n";
		$html .= "\"text\" = text moet letterlijk voorkomen inclusief";
		$html .= " spaties<br>\n";
	}

	# close the document and send whatever is still buffered
	$html .= "</body></html>";
	my $sent = send_to_client($html);
	print "$sent bytes sent.\n";
	close Client;
}




