#!/usr/bin/perl

#####################################################################
#
#   penetrator - personal indexer
#
#   Copyright (C) 2000/2003 Angel Ortega <angel@triptico.com>
#
#   This program is free software; you can redistribute it and/or
#   modify it under the terms of the GNU General Public License
#   as published by the Free Software Foundation; either version 2
#   of the License, or (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
#   http://www.triptico.com
#
#####################################################################

use Penetrator;
use Getopt::Long;

use locale;
use POSIX qw (locale_h);

$VERSION = $Penetrator::VERSION;

# trees to index
@trees = ();

# extensions of files to index
@file_extensions = ();

# file types to index, as returned from /usr/bin/file
@file_types = ();

# exclude files having this prefix
@exclude_path = ();

# filters (x2txt)
%filters = ();

# flag for asking /usr/bin/file for the file types
$ask_file = 0;

# 'file' program path with options
$path_to_file = "/usr/bin/file -b";

# verbosity flag
$verbose = 0;

# show files only, not the line containing it
$files_only = 0;

# show only occurrences on the same line
$same_line = 0;

# minimum size for a word to be indexable
$min_word_size = 0;

# driver selector: DBM, DBI
$driver_type = "DBM";

# ### Driver info

# DBM index file
$index_file = "/var/penetrator.db";

# DBM cache directory
$cache_dir = "";

# DBI connection string
$dbi_connection_string = "";

# DBI user and password
$dbi_user = undef;
$dbi_passwd = undef;

# DBI SQL types, defaults are PostgreSQL ones
$dbi_key_code_type = "serial";
$dbi_code_type = "integer";
$dbi_doc_type = "text";
$dbi_word_type = "text";
$dbi_query_type = "text";

# additional SQL commands for creation, usually indexes
@dbi_create_command = ();

# additional SQL commands to be run before recreating the tables
@dbi_drop_command = ();

# maximum number of seconds allowed reindexing
$max_running_time = 3600;

# profile to be loaded from file
$profile = "default";

# available profiles from config file
@available_profiles = ();

# protect with transaction
$transaction = 0;

# use temporary tables on searchs
$tmp_table_search = 1;

# query cache usage flag
$use_query_cache = 1;

# maximum retries on errors inserting words 
# (mostly due to duplicated words concurrently inserted)
# before giving up the indexing of a document
$max_word_insert_retries = 20;

# ### CGI data

# path->URL substitutions
@path_to_url = ();

# style sheet
$cgi_css = "";

# default number of hits per page
$cgi_hits_per_page = 20;

# title for the CGI
$cgi_title_first = "Penetrator Search";
$cgi_title_results = "Penetrator Search Results";


#####################################################################

# parse options
if(!GetOptions( "r|reindex"		=>	\$reindex,
		"z|zap" 		=>	\$zapindex,
		"l|files-with-matches"	=>	\$files_only,
		"version"		=>	\$show_version,
		"f|file=s"		=>	\$config_file,
		"s|same-line"		=>	\$same_line,
		"p|profile=s"		=>	\$profile,
		"v|verbose"		=>	\$verbose,
		"t|transaction" 	=>	\$transaction,
		"very-verbose"		=>	\$insanely_verbose)
	      or $usage)
{
	usage();
}

$verbose = 1 if $insanely_verbose;

if($show_version)
{
	print "$VERSION\n";
	exit(0);
}

# if program is named .cgi, act as a CGI
$is_cgi = ( $0 =~ /\.cgi$/ ? 1 : 0);

# if config_file is not defined, try several possibilities
unless($config_file)
{
	$config_file = "$ENV{'HOME'}/.penetratorrc";
	$config_file = "./.penetratorrc" if $is_cgi and not -f $config_file;
	$config_file = "/etc/penetratorrc" unless -f $config_file;
}

print "Using profile $profile\n" if $verbose;

read_config($config_file);

# Create the Penetrator object

$ph = new Penetrator('type'		=> $driver_type,
		   'min_word_size'	=> $min_word_size,
		   'zap'		=> $zapindex,
		   'disable_query_cache' => !$use_query_cache,
		   'index_file' 	=> $index_file,
		   'cache_dir'		=> $cache_dir,
		   'dbi_string' 	=> $dbi_connection_string,
		   'dbi_user'		=> $dbi_user,
		   'dbi_passwd' 	=> $dbi_passwd,
		   'transaction'	=> $transaction,
		   'insert_retries'	=> $max_word_insert_retries,
		   'key_code_type'	=> $dbi_key_code_type,
		   'code_type'		=> $dbi_code_type,
		   'doc_type'		=> $dbi_doc_type,
		   'word_type'		=> $dbi_word_type,
		   'query_type' 	=> $dbi_query_type,
		   'drop_commands'	=> \@dbi_drop_command,
		   'create_commands'	=> \@dbi_create_command
		);

print "Using $driver_type driver\n" if $verbose;

if($is_cgi)
{
	penetrator_cgi($ph);
}
elsif($reindex)
{
	penetrator_reindex($ph);
}
elsif($zapindex)
{
	print "Index Zapped.\n" if $verbose;
}
else
{
	# by default, just search
	usage() unless scalar(@ARGV);

	penetrator_console_search($ph,@ARGV);
}

$ph->close();

exit(0);


sub read_config
# reads the config file
{
	my ($fn) = @_;
	my ($pp);

	open F, $fn or die "Can't open configuration file '$fn'";

	while(<F>)
	{
		my ($key,$value);

		chop;
		next if /^$/;
		next if /^#/;

		if(/\[(.*)\]/)
		{
			$pp = $1;
			push(@available_profiles, $pp);

			next;
		}

		# skip if profile is not what user want
		next if $pp and $pp ne $profile;

		($key,$value) = /([^:]*):\s*(.*)/;

		if($key eq "dbm_index_file" or
		   $key eq "plain_index_file" )
		{
			$index_file = $value;
		}
		elsif($key eq "dbm_cache_dir" or
		      $key eq "plain_cache_dir" )
		{
			$cache_dir = $value;
		}
		elsif($key eq "ask_file")
		{
			$ask_file = $value;
		}
		elsif($key eq "path_to_file")
		{
			$path_to_file = $value;
		}
		elsif($key eq "file_exts")
		{
			@file_extensions = (@file_extensions, split(/ /,$value));
		}
		elsif($key eq "file_types")
		{
			@file_types = (@file_types, split(/ /,$value));
		}
		elsif($key eq "trees")
		{
			foreach my $tree (split(/ /,$value))
			{
				if(-d $tree)
				{
					push(@trees, $tree);
				}
				else
				{
					print "warning: [$tree] not a directory\n";
				}
			}
		}
		elsif($key eq "exclude")
		{
			push(@exclude_path, $value);
		}
		elsif($key eq "path_to_url")
		{
			push(@path_to_url, $value);
		}
		elsif($key eq "css")
		{
			$cgi_css = $value;
		}
		elsif($key eq "driver_type")
		{
			$driver_type = $value;
		}
		elsif($key eq "dbi_connection_string")
		{
			$dbi_connection_string = $value;
		}
		elsif($key eq "dbi_key_code_type")
		{
			$dbi_key_code_type = $value;
		}
		elsif($key eq "dbi_code_type")
		{
			$dbi_code_type = $value;
		}
		elsif($key eq "dbi_doc_type")
		{
			$dbi_doc_type = $value;
		}
		elsif($key eq "dbi_word_type")
		{
			$dbi_word_type = $value;
		}
		elsif($key eq "dbi_query_type")
		{
			$dbi_query_type = $value;
		}
		elsif($key eq "dbi_create_command")
		{
			push(@dbi_create_command, $value);
		}
		elsif($key eq "dbi_drop_command")
		{
			push(@dbi_drop_command, $value);
		}
		elsif($key eq "max_running_time")
		{
			$max_running_time = $value;
		}
		elsif($key eq "cgi_hits_per_page")
		{
			$cgi_hits_per_page = $value;
		}
		elsif($key eq "tmp_table_search")
		{
			$tmp_table_search = $value;
		}
		elsif($key eq "cgi_title_first")
		{
			$cgi_title_first = $value;
		}
		elsif($key eq "cgi_title_results")
		{
			$cgi_title_results = $value;
		}
		elsif($key eq "use_query_cache")
		{
			$use_query_cache = $value;
		}
		elsif($key eq "min_word_size")
		{
			$min_word_size = $value;
		}
		elsif($key eq "filter")
		{
			if(my ($f,$c) = ($value =~ /^(\S*)\s+(.*)$/))
			{
				$filters{$f} = $c;
			}
		}
		elsif($key eq "locale")
		{
			setlocale(LC_ALL, $value);
		}
		else
		{
			print "warning: unknown [$key] key\n";
		}
	}

	close F;

	# useful default values
	unless(join(" ",@file_extensions))
	{
		@file_extensions=qw / .htm .html .txt .c .h .pl .cpp .hpp .pm .doc /;
	}

	unless(join(" ",@file_types))
	{
		@file_types = qw ( text/ );
	}

	# trees must exist
	die "At least one 'trees:' line must exist in config file" unless join(" ",@trees);

	return(1);
}


sub is_indexable
# returns true if filename is indexable
{
	my ($filename) = @_;
	my ($new);

	# test if excluded
	foreach my $e (@exclude_path)
	{
		return(undef) if $filename =~ /$e/;
	}

	$new = $filename;

	if($ask_file)
	{
		if(open FILE, $path_to_file . " " . "\"$filename\"|")
		{
			my ($ok) = 0;

			my ($ftype) = <FILE>;
			chop($ftype);
			$ftype =~ s/[;,].*$//;
			close FILE;

			foreach my $i (@file_types)
			{
				if($ftype =~ /$i/)
				{
					$ok = 1;
					last;
				}
			}

			# try to find a filter for that type
			if($ok and $filters{$ftype})
			{
				$new = sprintf("$filters{$ftype}|",$filename);
			}

			# ignore if not one of file types
			unless($ok)
			{
				print "Ignore: $filename ($ftype)\n" if $insanely_verbose;
				return(undef);
			}
		}
		else
		{
			die "Can't pipe to file '$path_to_file'";
		}
	}
	else
	{
		# ignore if not one of the allowed extensions
		$filename =~ /(\.\w*)$/;
		$ext = $1;
		return(undef) unless grep(/^\Q$ext\E$/,@file_extensions) > 0;
	}

	return($new);
}


sub penetrator_reindex
{
	my ($ph) = @_;
	my ($start_time);

	print "Testing for not-yet-existing or modified files...\n"
		if $verbose;

	$ph->get_docs();

	while(my ($filename, $mtime) = $ph->fetch_doc())
	{
		if($ph->test_file($filename, $mtime) and $verbose)
		{
			print "Un-indexing $filename...\n";
		}
	}

	# second step: travel the trees,
	# reindexing all files not found on the database

	print "Reindexing...\n" if $verbose;

	$start_time = time();

	foreach my $dir (@trees)
	{
		chdir($dir);
		open(DIR,"find . -type f|");

		while(<DIR>)
		{
			my ($filename,$ret,$fn);

			$t = time();

			if($t - $start_time > $max_running_time)
			{
				print "Max running time exceeded - exit\n"
					if $verbose;

				return;
			}

			$filename = $_;
			chop $filename;

			# strips leading period
			$filename =~ s/^\.\///;
			$filename = $dir . "/" . $filename;

			unless(($fn = is_indexable($filename)))
			{
				print "No     : $filename\n" if $verbose;
				next;
			}

			# index the document
			$ret = $ph->index_file($fn,$filename);

			if($ret == -1)
			{
				print "Fail   : $filename ($!)\n" if $verbose;
			}
			elsif($ret == -2)
			{
				print "Skip   : $filename\n" if $verbose;
			}
			elsif($ret == -3)
			{
				print "Give up: $filename\n" if $verbose;
			}
			elsif($ret == 0)
			{
				printf "Index  : $filename (%d sec)\n",
					 (time() - $t) if $verbose;
			}
		}

		close(DIR);
	}

	# invalidate cache
	$ph->clear_cache();

	print "Index build finished.\n" if $verbose;
}


##################################
# interfaces
##################################

sub penetrator_console_search
# search to stdout
{
	my ($ph,@keys) = @_;
	my ($res);

	$ph->search(@keys);
	@keys = map { Penetrator::normalize($_); } @keys;

	while (my $file = $ph->fetch())
	{
		if($files_only)
		{
			print "$file\n";
		}
		else
		{
			# open the file and print all
			# occurrences of all keys
			if(open F, $file)
			{
				my ($l);

				while($l = <F>)
				{
					my ($count) = 0;
					my ($n,@l);

					chop($l);
					@l = map { Penetrator::normalize($_); }
						split(/\W+/,$l);
					$n = join(' ',@l);

					foreach my $i (@keys)
					{
						if($n =~ /\b$i\b/i)
						{
							$count++;
							last unless $same_line;
						}
						else
						{
							last if $same_line;
						}
					}

					print "$file: $l\n"
						if(($count == scalar(@keys) and $same_line)
						or ($count == 1 and not $same_line));
				}

				close F;
			}
			else
			{
				print "ERROR: Can't open '$file': $!\n";
			}
		}
	}
}


sub penetrator_cgi
# penetrator as a cgi
{
	my ($ph) = @_;
	my ($params);

	$params = cgi_init();

	print "Content-Type: text/html\n\n";

	print "<!-- Penetrator Search Engine $VERSION - Angel Ortega <angel\@triptico.com> -->\n";

	if(defined($params->{'query'}))
	{
		my (@res,$count,$result);

		print "<head><title>$cgi_title_results for $params->{query}</title></head>\n";
		print "<link rel=StyleSheet href='$cgi_css' type='text/css'>\n" unless $cgi_css eq "none";
		print "<body>\n";
		print "<table width='100\%'><tr><td class=title>\n";
		print "<h1>$cgi_title_results</h1></table>\n";

		$ph->search(split(/[^\w\-\~]+/,$params->{'query'}));

		while(my $l = $ph->fetch())
		{
			push(@res,$l);
		}

		print "<p>There were <b>" . (scalar(@res)?scalar(@res):"NO") .
			"</b> matches for <i>" . $params->{'query'} . "</i>\n<br>";

		$result = 1;
		$count = 0;

		$params->{'count'} = $cgi_hits_per_page unless $params->{'count'};

		foreach my $file (@res)
		{
			my ($title,$url);

			if($result < $params->{'from'})
			{
				$result++;
				next;
			}

			if(defined $params->{'count'} and
			   $count >= $params->{'count'})
			{
				last;
			}

			# open file and extract title,
			# if it's HTML
			if(($file =~ /\.html$/ or $file =~ /\.htm$/)
				and open F, $file)
			{
				my ($doc);

				while(<F>)
				{
					s/[\r\n]//g;

					$doc .= $_;

					if($doc =~ /<title>(.*)<\/title>/i)
					{
						$title = $1;
						last;
					}
				}

				close F;
			}

			# try to substitute initial path to url
			foreach my $i (@path_to_url)
			{
				my ($from,$to) = split(/ /,$i,2);

				if($file =~ /^\Q$from\E(.*)/)
				{
					$url = $to . $1;
				}
			}

			$title = $file unless $title;

			print "<p><b>$result.</b> ";

			if($url)
			{
				# path to url worked: the file can be
				# accesible via the web browser
				print "<a href='$url'>$title</a><br>\n";
				print "<small><b>URL:</b> <code class=url>$url</code></small><br>\n";
			}
			else
			{
				# no path to url substitution:
				# show just the real path to file
				print "$title<br>\n";
				print "<small><b>File:</b> <code class=filename>$file</code></small><br>\n"
					if $title ne $file;
			}

			$result++;
			$count++;
		}

		# index of pages
		if($params->{'count'} < scalar(@res))
		{
			print "<p>";

			$params->{'from'} = 1 unless $params->{'from'};

			my ($page) = 1;
			for(my $c = 1;$c <= scalar(@res);$c+=$params->{'count'},$page++)
			{
				if($c == $params->{'from'})
				{
					print "$page ";
				}
				else
				{
					my ($cgi) = $ENV{"SCRIPT_NAME"};
					print "<a href='$cgi?query=$params->{'query-url-encoded'}&from=$c&count=$params->{'count'}'>";
					print "$page</a> ";
				}
			}

			print "<br>\n";
		}

		print "<hr>\n";
	}
	else
	{
		print "<head><title>$cgi_title_first</title></head>\n";
		print "<link rel=StyleSheet href='$cgi_css' type='text/css'>\n" unless $cgi_css eq "none";
		print "<body><table width='100\%'><tr><td class=title>\n";
		print "<h1>$cgi_title_first</h1></table>\n";
	}

	print "<br><form>\n";
	print "<input name=query value=\"$params->{query}\">\n";
	print "<input type=submit value='Search'>\n";
	print "</form>\n";

	exit(0);
}


sub cgi_init
{
	my ($class) = @_;
	my (%params,$f);

	if($ENV{'REQUEST_METHOD'} eq "GET")
	{
		$f = $ENV{'QUERY_STRING'};
	}
	else
	{
		# POST
		read(STDIN,$f,$ENV{'CONTENT_LENGTH'});
	}

	foreach my $p (split('&', $f))
	{
		my ($key,$val);

		if($p =~ /(.*)=(.*)/)
		{
			($key,$val) = ($1,$2);

			$params{"$key-url-encoded"} = $val;

			$val =~ s/\+/ /g;
			$val =~ s/%(..)/pack('c',hex($1))/eg;

			$params{$key} = $val;
		}
	}

	return(\%params);
}


sub usage
{
	print("penetrator $VERSION - personal indexer (console+CGI)\n");
	print("Copyright (C) 2000/2003 Angel Ortega <angel\@triptico.com>\n\n");

	print "Usage:\n";
	print "\n";
	print "penetrator -r [-v] [-f config_file]\n";
	print "penetrator key [key...] [-l] [-s] [-f config_file]\n";
	print "\n";
	print "    -r|--reindex 		Rebuild index.\n";
	print "    -z|--zap			Zap index (blank it).\n";
	print "    -l|--files-with-matches	Print only files matching.\n";
	print "    -s|--same-line		Print only lines containing all keys.\n";
	print "    -t|--transaction		Protect reindexing under a transaction\n";
	print " 				(DBI driver must support it).\n";
	print "    -v|--verbose 		Be verbose\n";
	print "    --very-verbose		Be insanely verbose.\n";
	print "    -f|--file			Set config file. (Defaults:\n";
	print " 				~/.penetratorrc, ./.penetratorrc (CGI only)\n";
	print " 				and /etc/penetratorrc)\n";
	print "    -p|--profile=PROFILE 	Select profile (see below)\n";
	print "    --version			Print version and exit\n";

	print "\nAvailable profiles:\n";
	print "@available_profiles\n";

	exit(1);
}


# kdsxcvii

