#!/usr/bin/perl -w

eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell

## Copyright (c) 2002-2003, Index Data.
## $Id: tkl-urlcheck,v 1.18 2005/11/06 08:50:07 sondberg Exp $

use TKL::URLcheck;
use TKL::Settings qw(:tkl_config);
use TKL::File;
use TKL::Log qw(:log_levels);
use XML::LibXML;
use LWP::UserAgent;
use bytes;                      ## Mostly to safisfy Solaris users
use strict;

tkl_set_config(	usage           => \&usage,
		local		=> "urlcheck.tkl",
		assign		=> [	"root=s",
                                        "which_agent=s",
			  		"path=s",
					"help",
					"output=s",
					"base_url=s",
					"from=s",
					"log=s",
					"nomail",
					"attempts=i",
					"message=s",
					"success=s",
                                        "get",
					"email_subject=s",
					"creator=s",
					"timeout=i",
					"debug"]	);

my $root = tkl_read_config('root') || usage();
my $smtp_agent = tkl_read_config('which_agent') || undef;
my $subdir = tkl_read_config('path') || "";
my $debug = tkl_read_config('debug') || 0;
my $http_get = tkl_read_config('get') || 0;
my $help = tkl_read_config('help') || 0; 
my $badurls_spec = tkl_read_config('output') || "$root/badurls.tkl";
my $no_mail = tkl_read_config('nomail') || 0;
my $attempts = tkl_read_config('attempts') || 3;
my $logfile = tkl_read_config('log') || "/var/log/tkl/urlchecker.log";
my $baseurl = tkl_read_config('base_url') || "http://no.base.url/specified";
my $email_from = tkl_read_config('from') || "TKL urlchecker <urlcheck\@localhost>";
my $mail_subject = tkl_read_config('email_subject') 
    || "Keystone URL checker report";
my $mail_message = tkl_read_config('message') 
    || "{NAME}

You have last modified the Keystone record

  {RECORD}

containing the URL

  {URL} .
 
This URL did not resolve {ATTEMPTS} times. Please update the URL by 
clicking the following edit link:

  {CORRECT} .


(This is a default mesage emitted by the Keystone URL checker. Please 
edit content and form of this message by placing a 'urlcheck.tkl' 
config file in the portal root)
 
";
my $timeout = tkl_read_config('timeout') || 30;
my $default_user;
my $tkl = new TKL::URLcheck(root => $root);
my $logger = new TKL::Log(logfile => $logfile, level => $debug ? tkl_log_level_all : tkl_log_level_norm);

if (defined(my $default_creator = tkl_read_config('creator'))) {
    $default_user = $tkl->get_user_info($default_creator) or
        $logger->log(tkl_log_warn, "Unknown default recipient [", $default_creator, "]");
}   

usage() if $help;

## Add accepted HTTP response codes to this list
my $accepted_response_codes = make_response_codes(tkl_read_config('success')) || {200=>1};
my @extract = qw(URL FILENAME CHECKEDNUMBER);
my @store = (@extract, qw(USER ERRORCODE CHECKEDLAST));
my $script_name = "TKL urlcheck/1.0";
my $ua = new LWP::UserAgent(timeout => $timeout, agent => $script_name);
my $matching = [];
my $unique_id = "FILENAME"; ## Use FILENAME as ID so each record gets checket (even if URL already been checked)
my $recs = {};
my @badurls = ();
my $badurls_file = defined($badurls_spec) ? $badurls_spec : "$root/badurls.tkl";

$logger->log(tkl_log_log, "Welcome to ", $script_name);
$logger->log(tkl_log_log, "Analyzing XML schemas...");

if ( defined($smtp_agent) ) {
    if ( -x $smtp_agent ) {
        $logger->log( tkl_log_log, "Using SMTP agent: $smtp_agent" );
    } else {
        $logger->log( tkl_log_warn, "Couldn't find SMTP agent: $smtp_agent" );
        $logger->log( tkl_log_warn, "Won't send any mails..." );
        $no_mail = 1;
    }
} else {
    $logger->log( tkl_log_log, "Using default SMTP agent: $TKL::User::mailer" );
}

my $schemas = $tkl->find_schemas();

unless (defined($schemas) && scalar keys %$schemas) {
    $logger->log(tkl_log_warn, "Found no schemas");
    exit(0);
}

## Extract URL fields
foreach my $schema (keys %$schemas) {
    if ( ! -r $schemas->{$schema} ) {
        $logger->log(tkl_log_warn, "Schema '$schema' is not readable" );
        next;
    }
    
    my $elements = $tkl->find_uri_elem($schemas->{$schema});
    if (scalar @$elements) {
	$logger->log(tkl_log_log, "Found URL field(s) in XML schema $schema, extracting URLs...");
	$tkl->find_urls($subdir, $elements, $schema, $matching);
    } else {
	$logger->log(tkl_log_debug, "Dropping XML schema $schema");
    }
}

## Update table
foreach my $entry (@$matching) {
    $recs->{$entry->{$unique_id}} = $entry;
}

my $number = scalar keys %$recs;

if ($number) {
    $logger->log(tkl_log_log, "Found $number distinct TKL documents with URL content");
} else {
    $logger->log(tkl_log_warn, "Unable to find any TKL documents with URL content, giving up!");
    $logger->log(tkl_log_warn, "Remember to use XML schema type xs:anyURI for fields with URL content to be probed");
    exit(0);
}

## Check if we already have badurls.tkl file with status...
if (-f $badurls_file) {
    $logger->log(tkl_log_log, "Found old URL check log file:", $badurls_file);
    my $badurls = new TKL::File(file => $badurls_file, root => $root);
    $logger->log(tkl_log_log, "Updating record information...");

    if ( my $badurls_docroot = $badurls->document_element ) {
        browse_badurls($recs, $badurls->document_element);
    } else {
        $logger->log(tkl_log_warn, "Unable to parse URL check report file [$badurls_file], skipping..." );
    }
}

my $count = 0;
# avoid division by zero for $number < 10
my $mod = 1+ (int $number / 10);

$logger->log(tkl_log_log, "Shuffling list of documents...");
fisher_yates_shuffle (my $file_list = [keys %$recs]);

$logger->log(tkl_log_log, "Ready to URL check");

## Perform the URL checking
foreach my $file (@$file_list) {
    if ($count % $mod == 0) {
	$logger->log(tkl_log_log, "Progress: ", sprintf("%0.1f", 100 * $count / $number), "%");
    }
    $count ++;
    my $rec = $recs->{$file};
    my $url = $rec->{URL};
    
    $logger->log(tkl_log_debug, "Checking URL: $url");
    
    my ($http_code, $msg) = check_url($ua, $url);
    
    $logger->log(tkl_log_debug, "HTTP response code: ", $msg);

    if ($accepted_response_codes->{$http_code}) {
	$logger->log(tkl_log_debug, "URL [$url] OK");
    } else {
	$logger->log(tkl_log_debug, "URL [$url] Not OK");
	$rec->{CHECKEDNUMBER} ++;
	$rec->{ERRORCODE} = $msg;
	$rec->{CHECKEDLAST} = get_timestamp();
	push @badurls, $rec;
    }
}

$logger->log(tkl_log_log, "Progress: 100.0 %");
$logger->log(tkl_log_log, "Found ", scalar @badurls, " bad URLs.");
$logger->log(tkl_log_log, "Updating URL check report ", $badurls_file);

if (-f $badurls_file) {
    my $backup = $badurls_file . ".old";
    $logger->log(tkl_log_log, "Storing old URL check report into $backup");
    system("cp " . $badurls_file . " $backup") &&
    	$logger->log(tkl_log_warn, "Unable to store old version: $?");
}

my $badurls_xml = generate_report(\@badurls);
my $write_file = 1;

open OUTPUT, ">" . $badurls_file or $write_file = 0;

if ($write_file) {
    print OUTPUT $badurls_xml;
    close OUTPUT;
} else {
    $logger->log(tkl_log_warn, "Unable to store URL check report in file ", $badurls_file, ": $!");
}


sub make_response_codes {
    my ($raw) = @_;
    my $ret = {};

    return unless defined $raw;

    $raw =~ s/\s*//g;
    
    foreach (split /,/, $raw) {
	$ret->{$_} = 1;
    }

    return $ret;
}


sub check_url {
    my ($ua, $url) = @_;
    my $r;
    
    if ($http_get) {
        $r = $ua->get($url);
    } else {
        $r = $ua->head($url);
    }

    if ($debug) {
	my @sub_req = ();
	my $sub = $r;
	while (my $prev = $sub->previous()) {
	    push @sub_req, $prev;
	    $sub = $prev;
	}
	foreach my $sub (reverse @sub_req) {
	    $logger->log(tkl_log_debug, "Sub-request response: ", $sub->status_line);
	}
    }

    return ($r->code(), encodeToUTF8( 'ISO-8859-1', $r->status_line ) );
}


sub generate_report {
    my ($list) = @_;
    my $badurls_xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<badurls>\n";
    
    foreach my $rec (@$list) {
	$logger->log(tkl_log_debug, "Bad URL: ", $rec->{URL}, " in file ", $rec->{FILENAME});
	$badurls_xml .= "  <BADURL>\n";
	foreach my $field (@store) {
	    next unless defined $rec->{$field};
	    $badurls_xml .= "    <$field>" . $tkl->string2xmlstring($rec->{$field}) . "</$field>\n";
	}
	$badurls_xml .= "  </BADURL>\n";
	if (($rec->{CHECKEDNUMBER} >= $attempts) && (!$no_mail)) {
	    send_mail($rec);
	}
	
    }
    
    $badurls_xml .= "</badurls>\n";

    return $badurls_xml;
}


sub send_mail {
    my ($rec) = @_;

    if (!defined($rec->{USER}) or !length($rec->{USER})) {
	$logger->log(tkl_log_warn, "No author information found in document [", $rec->{FILENAME}, "]");
	if (defined($default_user)) {
	    $logger->log(tkl_log_warn, "Using default editor [", $default_user->get_field('email'), "]");
	    $rec->{USER} = $default_user->get_field('login');
	} else {
	    return;
	}
    }
    
    $logger->log(tkl_log_log, "Sending reminder to author [", $rec->{USER}, "] concerning document [", $rec->{FILENAME}, "]");
    
    my $user = $tkl->get_user_info($rec->{USER});

    unless ($user) {
	$logger->log(tkl_log_warn, "Unknown TKL user [", $rec->{USER}, "]");
	if ($default_user) {
	    $user = $default_user;
	    $logger->log(tkl_log_warn, "Using default recipient [", $default_user->get_field('email') , "]");
	} else {
	    $logger->log(tkl_log_warn, "Unable to send mail to creator/modifier of TKL file [", $rec->{FILENAME}, "]");
	    return;
	}
    }
    
    if ($user) {
	my ($record) = $rec->{FILENAME};
	
	$record =~ s/^\Q$root\E//;
	$record .= "/" unless $record =~ /^\//;
	
	$rec->{RECORD} = $record;
	$rec->{CORRECT} = $baseurl . "$record?edit=1";
	$rec->{NAME} = $user->get_field('name');
	$rec->{ATTEMPTS} = $attempts;
	
	my $subject = $user->user_template($rec, $mail_subject);
	my $message = $user->user_template($rec, $mail_message);

        if ( defined($smtp_agent) ) {
            $user->{mailer} = $smtp_agent;
        }

	$user->set_from_addr($email_from);
	$user->send_mail($subject, $message);
    }
}


sub browse_badurls {
    my ($href, $node) = @_;
    my @nodes = $node->findnodes("BADURL");

    foreach my $rec_node (@nodes) {	## Run through all the old entries
	my $rec = {};
	my $valid = 1;
	foreach my $field (@extract) {	## Get the needed information
	    my ($value_node) = $rec_node->findnodes($field);
	    unless (defined $value_node) {	## Check if it's there...
		$logger->log(tkl_log_warn, "Unable to find element '$field'");
		$valid = 0;
		last;
	    }
	    $rec->{$field} = $value_node->getFirstChild->nodeValue;
	}
	my $index = $rec->{$unique_id};
	if ($valid && defined($href->{$index})) {	## If the entry already exists..
	    my $old = $href->{$index};
	    my %new = (%$rec, %$old);
	    $href->{$index} = \%new;			## override it with old info from badurls.tkl
	}
	if ($debug && $valid && !defined($href->{$index})) {
	    $logger->log(tkl_log_warn, "Removing file '", $rec->{'FILENAME'}, "' from URL check list");
	}
    }
}


sub fisher_yates_shuffle {
    my ($array) = @_;
    my $i;

    for ($i = @$array; --$i; ) {
	my $j = int rand ($i + 1);
	next if $i == $j;
	@$array[$i, $j] = @$array[$j, $i];
    }
}

sub get_timestamp {
    my ($day, $month, $year, $sec, $min, $hour) = (localtime)[3, 4, 5, 0, 1, 2];
    my $tm = sprintf("%d/%02d/%02d, %02d:%02d:%02d",
    		     $year + 1900, $month + 1, $day, $hour, $min, $sec);

    return $tm;
}


sub usage {
    print STDERR "Usage: $0 -r tkl-portal-root [options]\n\n";
    print STDERR "Options:\n";
    print STDERR "  -r path             TKL portal root\n";	
    print STDERR "  -d                  Debugging mode\n";
    print STDERR "  -h                  Shows this information\n";
    print STDERR "  -o file             Use alternative badurls.tkl\n";
    print STDERR "  -b base-URL         What is the portal base URL?\n";
    print STDERR "  -n                  Don't attempt to send any mails\n";
    print STDERR "  -w /path/to/agent   Use alternative SMTP agent\n";
    print STDERR "  -l logfile          Specify an alternative logfile\n";
    print STDERR "  -a integer          No. of tolerated unsuccesfull URL checks\n";
    print STDERR "  -p sub-directory    Restrict scanning to sub-directory of portal\n";
    print STDERR "  -t timeout          HTTP timeout in seconds\n";
    print STDERR "  -s code1,code2,..   Successful HTTP response codes\n";
    print STDERR "  -g                  Use HTTP GET instead of HEAD\n";
    print STDERR "  -m email_message    Bad URL email message\n";
    print STDERR "  -e email_subject    Bad URL email subject\n";
    print STDERR "  -f email_from_addr  What from-address should be used in email?\n\n";
    exit(0);
}


__END__

=head1 NAME

tkl-urlcheck - URL checking facility for the TKL portal framework.

=head1 SYNOPSIS

  % tkl-urlcheck -r /my/tkl/portal/root

=head1 DESCRIPTION

This is a command line URL checker, which extracts and probes URLs
in your TKL portals.

URLs are identified as XML text nodes embedded in elements which
claim to be of the type XML schema type xs:anyURI.

The simplest way to invoke this component is this:

  % tkl-urlcheck -r /path/to/tkl/portal/root

The -r (or --root=) argument must be specified since this is the
only way for the component to find the portal area.

Thereby, a sequence of actions take place:

  - The portal is scanned for available document types (schemas)
  - Schemas with elements of type xs:anyURI are selected
  - TKL instances of these schemas are found
  - Each URL is contacted and probed
  - The outcome of the probing is kept in a status report
  - Emails are sent to authors of records with URLs unsuccesfully
    probed 3 times or more.

=head2 Command-line arguments

These command line switches/arguments can be used with tkl-urlcheck. Confirm
with TKL::Config(3) for details about what default configuration settings
are used as defaults.

  -r path, --root=path		Path to the TKL portal root (mandatory)
  
  -d, --debug			Debugging mode switch (i.e. verbose logging)
  
  -h, --help			A help text is shown
  
  -o file, --output=file	Store URL check report in this file
  
  -b URL, --base_url=URL	What is the portal base URL?
  
  -n, --nomail			Don't attempt to send any mails
  
  -w, --which_agent	        Use this SMTP agent to send mails	
  
  -l logfile, --log=logfile	Specify an alternative logfile
  
  -a n, --attempts=n		No. of tolerated unsuccesfull URL checks
  
  -p subdir, --path=subdir	Restricts URL scanning to this portal subdir
  
  -f email, --from=email	What from-address should be used in emails?

  -m msg, --message=msg		Bad URL message body
  
  -e subj, --email_subject=subj	Bad URL message subject 

  -t n, --timeout=n		HTTP timeout in seconds

  -s codes, --success=codes	Comma-separated list of successful HTTP
  				response codes

  -g                            Use HTTP GET instead of HEAD

=head2 Config file customization

The behaviour and the messages sendt can be configured by placing a suitable
config file named 'urlcheck.tkl' in the portal root. As template you may use 
the following XML file: 

<?xml version="1.0"?>
<config>

  <!-- Bad URL email header -->
  <setting name="email_subject" value="Keystone URL checker report"/>


  <!-- This is the bad url email message -->
  <!-- Use the {xxxx} syntax for variable insertion -->
  <setting name="message">{NAME}

You have last modified the Keystone record

  {RECORD}

containing the URL

  {URL} .
 
This URL did not resolve {ATTEMPTS} times. Please update the URL by 
clicking the following edit link:

  {CORRECT} .


(This is a default mesage emitted by the Keystone URL checker. Please 
edit content and form of this message by placing a 'urlcheck.tkl' 
config file in the portal root)
  </setting>

  <!-- Number of times in a row a URL will be tolerated to be bad -->
  <setting name="attempts" value="3"/>

  <!-- HTTP timeout in seconds -->
  <setting name="timeout" value="15"/>

  <!-- Comma-separated list of successful HTTP response codes -->
  <setting name="success" value="200"/>
  
  <!-- Default creator/modifier is used as recipient for bad URL -->
  <!-- emails when no other information can be extracted from -->
  <!-- a TKL document -->
  <setting name="creator" value="admin"/>

  <!-- Remove the comments if the tkl-urlcheck component should not -->
  <!-- send any mails -->
  <!--
  <setting name="nomail"/>
  -->

</config>


  
=head1 AUTHOR

Anders Snderberg Mortensen <sondberg@indexdata.dk>
Indexdata, Copenhagen, Denmark
2003/10/06

=head1 SEE-ALSO

Man-pages for the various TKL:: packages, escpecially TKL::URLcheck.

=cut
