#!/usr/bin/perl -w

## Copyright (c) 2002-2003, Index Data.
## $Id: tkl-getharv,v 1.13 2006/05/22 10:19:24 sondberg Exp $

## This script runs through a TKL portal and looks for TKL files
## containing URLs to harvest.
##
## By:	Anders Sønderberg Mortensen <sondberg@indexdata.dk>
##      Indexdata, Copenhagen.

use TKL;
use TKL::Log qw(:log_levels);
use TKL::Settings qw(:tkl_config); 
use strict;

## Declare the command-line switches understood by this script.  The
## "usage" callback is invoked by TKL::Settings when arguments are bad.
tkl_set_config( usage           => \&usage,
                assign          => [    "root=s",
                                        "path=s",
                                        "xpath=s",
                                        "url=s",
                                        "doctype=s",
                                        "help",
                                        "output=s",
                                        "logfile=s",
                                        "verbose"]      );

## Pull each setting out of the parsed configuration, falling back to a
## sensible default.  A missing --root is fatal: usage() exits.
my $root = tkl_read_config('root') || usage();
my $path = tkl_read_config('path') || "";
my $verbose = tkl_read_config('verbose') || 0;
my $doctype = tkl_read_config('doctype') || "";
my $task_file = tkl_read_config('output') || "/tasks/task-auto.tkl";
my $harvest_xpath = tkl_read_config('xpath') || "/*/harvest";
my $url_xpath = tkl_read_config('url') || "/*/identifier";
my $task_url_field = "url";
my $logfile = tkl_read_config('logfile') || "/var/log/tkl/tkl-getharv.log";
my $help = tkl_read_config('help') || 0;

usage() if $help;

## Deprecated --doctype switch: substitute the wildcard document-type
## component of both xpaths with the requested type.
if ( length( $doctype ) ) {
    $harvest_xpath =~ s/\/\*/\/$doctype/;
    $url_xpath =~ s/\/\*/\/$doctype/;
}

## Direct method-call syntax instead of indirect object syntax
## ("new TKL::Log(...)"), which Perl can misparse.
my $logger = TKL::Log->new(logfile => $logfile, level => $verbose ? tkl_log_level_all : tkl_log_level_norm);


## Log a fatal message through the shared $logger and terminate the
## script with a non-zero exit status.  Never returns.
sub log_die {
    my @msg = @_;

    ## Log the named copy (the original captured @msg but then logged @_,
    ## leaving @msg unused).
    $logger->log( tkl_log_fatal, @msg );
    exit(1);
}


## Direct method-call syntax instead of indirect object syntax ("new TKL(...)").
my $tkl = TKL->new(root => $root);

$logger->log( tkl_log_debug, "Scanning portal $root for URLs to harvest..." );
$logger->log( tkl_log_log, "Looking for elements with Xpath [$harvest_xpath]" );
$logger->log( tkl_log_log, "Extracting element [$url_xpath]" );

## Run through the portal and look for records with URLs which
## we want to harvest.  ($path is always a defined string — it defaults
## to "" above — so the old "defined $path ? $path : ''" was redundant.)
my $href = {};
process($path, $href);

$logger->log( tkl_log_debug, "Found ", scalar keys %$href, " URL(s) to harvest" ); 

## If we found something, update the automatic harvester task file
update_task_file ($href) if keys %$href;


## Trim and XML-escape a raw URL taken from a TKL record.
## Returns the cleaned URL, or nothing (undef in scalar context) when the
## value is missing or is not an http:// URL.
## NOTE(review): https:// URLs are rejected here — confirm with the
## harvester configuration before widening the filter.
sub normalize_url {
    my $url = shift;

    ## Guard: s/// on undef would emit "uninitialized value" warnings.
    return unless defined $url;

    $url =~ s/^\s+//;       # strip leading whitespace
    $url =~ s/\s+$//;       # strip trailing whitespace
    $url =~ s/&/&amp;/g;    # escape ampersands for XML output

    ## Bare "return" (not "return undef") so list-context callers get an
    ## empty list rather than a one-element (undef) list.
    return unless $url =~ m{^http://};
    return $url;
}


## Recursively walk the portal directory $dir, adding every harvestable
## URL (records whose $harvest_xpath element holds "1") to the hashref
## $href (URL => 1, so duplicates collapse).  Uses the file-level $tkl
## and $logger handles.
sub process {
    my ($dir, $href) = @_;
    my $this_dir = $tkl->browse($dir);
    my $files = $this_dir->{files};
    my $dirs = $this_dir->{dirs};


    $logger->log( tkl_log_debug, "Browsing directory $dir ..." );

    foreach my $file (@$files) {
	my $doc_root = $file->document_element;

        if ( !$doc_root ) {
            $logger->log( tkl_log_warn, "Unable to parse file [", $file->portal_filename( ), "]" );
            next;
        }

	my ($harvest_node) = $doc_root->findnodes( $harvest_xpath );
	my ($url_node) = $doc_root->findnodes( $url_xpath );
	next unless defined $harvest_node && defined $url_node;

	## Guard against empty elements: getFirstChild returns undef when
	## there is no text child, and calling nodeValue on undef dies.
	my $harvest_text = $harvest_node->getFirstChild;
	my $url_text = $url_node->getFirstChild;
	next unless defined $harvest_text && defined $url_text;

	next unless $harvest_text->nodeValue eq "1";
	
	my $url = normalize_url($url_text->nodeValue);
	
	if ( defined( $url ) ) {
            $logger->log( tkl_log_debug, "Adding [$url] to list of URLs to harvest" );
	    $href->{$url} = 1;
	} else {
	    ## (Fixed typo in the warning text: "formet" -> "format".)
	    $logger->log( tkl_log_warn, "Unrecognizable URL format in TKL file [" . $file->portal_filename( ) . "]" );
	}
    }
    
    ## Recurse into every sub-directory, accumulating into the same hash.
    foreach my $subdir (@$dirs) {
	process("$dir/$subdir", $href);
    }
}


## Rewrite the harvester task file so it contains exactly the URLs
## collected in %$href: existing <url> children are removed and one new
## element per collected URL is appended, then the document is written
## back (UTF-8 where supported).  Returns 0 when the task file cannot be
## parsed, 1 on success; dies via log_die() on write failure.
sub update_task_file {
    my ($href) = @_;
    my $root = $tkl->find_portal_root;
    my $file = "$root/$task_file";
    my $tkl_file = $tkl->get_file($task_file);

    $logger->log( tkl_log_log, "Updating task file $file ..." );

    my $task_root = $tkl_file->document_element;

    if ( !$task_root ) {
        $logger->log( tkl_log_warn, "Unable to open/parse [$file]" );
        return 0;
    }

    my $dom = $task_root->getOwnerDocument;
    my @url_nodes = $task_root->findnodes($task_url_field);

    ## First remove what's already there...
    foreach (@url_nodes) {
	$task_root->removeChild($_);
    }

    ## Insert our newly collected URLs...
    foreach my $url (keys %$href) {
	my $url_node = $dom->createElement($task_url_field);
	my $url_text = $dom->createTextNode($url);

	$url_node->appendChild($url_text);
	$task_root->appendChild($url_node);
    }

    my $openmode =  open_mode_utf8(">");

    ## Lexical filehandle instead of the package-global bareword FH, and
    ## a checked print: buffered write errors were previously lost.
    open(my $fh, $openmode, $file)
	or log_die "Unable to open file '$file' for writing: $!";
    print {$fh} $dom->toString(1), "\n"
	or log_die "Unable to write to file '$file': $!";
    close($fh)
	or $logger->log( tkl_log_warn,  "Unable to close file '$file' properly: $!" );

    return 1;
}

## Append a ':utf8' I/O layer to the given open() mode when the running
## perl supports layers (5.8+).  A blanket "use open ':utf8'" was not
## usable on perl 5.6.1 from Debian stable, hence this helper.
sub open_mode_utf8 {
    my ($mode) = @_ ;
    return $] < 5.008 ? $mode : "$mode:utf8";
}

## Print the command-line help text to STDERR and terminate with exit
## status 1.  Never returns; also registered with tkl_set_config as the
## bad-arguments callback.  (Fixed user-facing typo: "depricated" ->
## "deprecated"; the sixteen print statements became one heredoc.)
sub usage {
    print STDERR <<"END_USAGE";
Usage: $0 [OPTIONS]

   -r, --root                 TKL document root
   -p, --path                 Path to scan relative to document root
   -v, --verbose              Verbose output
   -x, --xpath                Look for harvest information at this xpath,
                              default value is /*/harvest
   -u, --url                  Extract URL from this xpath,
                              default value is /*/identifier
   -d, --doctype              Only look for TKL documents of
                              this type, default is any type '*'
                              NB: This setting is deprecated, use
                              the settings -u and -x instead
   -h, --help                 This help text
   -l, --logfile              Specify your own logfile
   -o, --output               Store pending URLs in this task file

END_USAGE
    exit(1);
}


__END__

=head1 NAME

tkl-getharv - Extract URLs to be web harvested from a Keystone portal

=head1 SYNOPSIS

% tkl-getharv -r /my/tkl/portal/root

=head1 DESCRIPTION

This command line tool extracts those URLs from a Keystone portal, which belong
to a tkl document with the xpath element set to true. The xpath can be 
specified as a command line switch (-x). The default value is /*/harvest.

Such URLs are inserted into a task document in the tasks sub-directory of your
Keystone portal (or whatever is specified using the -o switch). When the
Keystone web-harvester is configured and initialized, the task file is
subsequently picked up and being processed.

=head2 Command-line arguments

This script accepts the following command line switches:
    
  -r, --root                    TKL/Keystone document root

  -p, --path                    Path to scan relative to document root
  
  -v, --verbose                 Verbose logging
  
  -x, --xpath                   Look for harvest information at this xpath,
                                default value is /*/harvest
                                
  -u, --url                     Extract URL from this xpath, default value
                                is /*/identifier
                                
  -d, --doctype                 Only look for TKL documents of this type,
                                default is any type '*'
                                NB: This setting is deprecated, use the
                                settings -u and -x instead
                                
  -h, --help                    Display this help text
  
  -l, --logfile                 Specify your own logfile
  
  -o, --output                  Store pending URLs in this task file


=head1 AUTHOR

Anders Sønderberg Mortensen <sondberg@indexdata.dk>
Indexdata, Copenhagen, Denmark
2006/05/22

=head1 SEE-ALSO

Man-pages for the Keystone web-harvester, man-pages for the other Keystone
components and possibly for the Perl packages XML::LibXML::***

=cut

