#!/usr/bin/perl -w

eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell


use TKL::OAI;
use TKL::Task;
use TKL::File;
use XML::LibXML;
use XML::LibXSLT;

use strict;

# Verbosity setting as reported by TKL::Task. It is not referenced again in
# the visible part of this file.
my $debug = TKL::Task::getverbose();
#my $debug = 1;

# File-scope configuration variables.
# Spool directory, pid file and log file locations are supplied by TKL::Task;
# they are handed to oai_daemon(), which writes them to the log.
my $oai_spool_dir = TKL::Task::getspooldir();
my $oai_pid_file =  TKL::Task::getpidfile();
my $oai_log_file =  TKL::Task::getlogfile();
# Seconds oai_daemon() sleeps between two scans of the task list.
my $oai_sleep_daemon = 31;
# Seconds oai_get_records() sleeps after each resumption-token request.
my $oai_sleep_reconnect = 10;
my $oai_file_prefix_default = "link-";	## This is the default file prefix
# Prefix actually used for harvested files; set per task in oai_process_task().
my $oai_file_prefix;
# Suffix of harvested files, used by oai_next_file().
my $oai_file_suffix = ".tkl";
# A task handler whose name ends in this string is treated by
# oai_process_task() as "no handler configured".
my $oai_default_handler = "do_nothing.handler";


# These two must live at file scope: the record callback oai_handle_record()
# is only passed the OAI object and the raw XML, so the target directory and
# the compiled stylesheet cannot be handed to it as arguments.
my $oai_target_dir;
my $style_xslt;


# Start the daemon. oai_daemon() loops forever, so nothing after this call
# is executed at run time.
oai_daemon($oai_spool_dir, $oai_pid_file, $oai_log_file);

# all function definitions below here

# Main loop of the harvester: logs the start-up settings, records the pid
# file through TKL::Task and then polls the task list forever.
#
# Parameters (used for logging only):
#   spool directory, pid file path, log file path
#
# Never returns.
sub oai_daemon {
    my ($spool_dir, $pid_file, $log_file) = @_;

    # logf() adds date and pid to every line on its own
    TKL::Task::logf("oai_daemon(): tkl-oai harvester started");
    TKL::Task::set_pid_file();

    # Report the effective settings, one log line per value.
    my @settings = (
        [ '$oai_pid_file',  $pid_file  ],
        [ '$oai_log_file',  $log_file  ],
        [ '$oai_spool_dir', $spool_dir ],
    );
    for my $setting (@settings) {
        my ($label, $value) = @$setting;
        TKL::Task::logf("oai_daemon(): " . $label . " = " . $value);
    }

    # Poll for tasks, pausing $oai_sleep_daemon seconds between scans.
    while (1) {
        TKL::Task::debug("oai_daemon(): getting tasks");
        oai_parse_task_files();
        sleep($oai_sleep_daemon);
    }
}

# Fetch the current task list from TKL::Task and process every entry in
# turn with oai_process_task().
#
# Takes no arguments; any that are passed are ignored. (The former
# $oai_spool_dir parameter was never supplied by the caller and never read.)
sub oai_parse_task_files {
    TKL::Task::debug("oai_parse_task_files(): getting task lists");

    my @filelist = TKL::Task::gettasklist();

    for my $task (@filelist) {
        oai_process_task($task);
    }
}

# Run a single spooled task through verification, harvesting and
# post-processing.
#
# Parameters:
#   $tkl_file - task object taken from the list returned by
#               TKL::Task::gettasklist(). This code reads its {spoolfile},
#               {file} and {root} entries and calls its verify_task() and
#               change_task_status() methods.
#
# The work happens inside an eval block, which has three possible outcomes:
#   return  - leaves only the eval (not this sub): stop processing quietly;
#             the spool file is removed only where done explicitly
#   die     - caught below: the task is marked "error" and its spool file
#             is removed
#   neither - the task was harvested and handed to its handler / the indexer
sub oai_process_task {
    my ($tkl_file) = @_;
    TKL::Task::debug("oai_process_task(): processing $tkl_file");

    # declared outside the eval because the error path needs them too
    my $spoolfile;
    my $fn;

    eval {
        $spoolfile = $tkl_file->{spoolfile};
        $fn        = $tkl_file->{file};

        my %task_info = $tkl_file->verify_task();
        if ( !%task_info ) {
            # spool file is deliberately kept in this case
            TKL::Task::logf("oai_process_task: verification of task $fn failed"
                            .". Skipping");
            return;
        }
        if ( $task_info{"type"} ne "oai" ) {
            TKL::Task::debug("oai_process_task: task $fn is not oai but a "
                             .$task_info{"type"}.". Skipping");
            return;
        }
        if ( $task_info{"status"} eq "running" ) {
            # A task can only be found in state "running" here if a previous
            # daemon was interrupted in mid-harvest. The error path below
            # marks the task as failed and removes the spool file.
            TKL::Task::logf("oai_process_task: error: task $fn has status "
                            .$task_info{"status"}
                            .". This indicated that the tkl-oai daemon had been "
                            ."stopped");
            die "task $fn was left in status running";
        }
        if ( $task_info{"status"} ne "pending" ) {
            TKL::Task::debug("oai_process_task: task $fn has status "
                             .$task_info{"status"}." Skipping");
            oai_remove_spoolfile($spoolfile);
            return;
        }
        if ( !$task_info{"xslt"} ) {
            # the value may be undef here; avoid an "uninitialized" warning
            my $xslt = defined $task_info{"xslt"} ? $task_info{"xslt"} : "";
            TKL::Task::logf("oai_process_task: task $fn has no xslt stylesheet "
                            .$xslt." Skipping");
            oai_remove_spoolfile($spoolfile);
            return;
        }

        # Now process valid tasks!
        TKL::Task::debug("oai_process_task: processing valid task $fn");

        $tkl_file->change_task_status("running");

        # The record callback reads the target directory and the file prefix
        # from file-scope variables, so publish them before harvesting.
        $oai_target_dir = $task_info{"targetdir"};
        TKL::Task::logf("oai_process_task: running task $fn, target dir "
                        .$task_info{"targetdir"});
        TKL::Task::logf("oai_process_task: url " .$task_info{"url"});

        if ( defined $task_info{"prefix"} && length $task_info{"prefix"} ) {
            $oai_file_prefix = $task_info{"prefix"};
        }
        else {
            $oai_file_prefix = $oai_file_prefix_default;
        }

        # get all records - do the work
        oai_get_records($task_info{"url"}, $task_info{"xslt"},
                        $oai_target_dir, $task_info{"set"});
        $tkl_file->change_task_status("finished");

        # remove spool file after finishing tasks!
        oai_remove_spoolfile($spoolfile);

        for my $key (keys %task_info) {
            TKL::Task::logf("oai_process_task: Task info: [$key] => ["
                            . $task_info{$key} . "]");
        }

        # A missing handler and the default do-nothing handler both mean
        # "no handler": fall through to the built-in indexing.
        my $handler = $task_info{"handler"};
        if ( !defined $handler || $handler =~ /\Q$oai_default_handler\E$/ ) {
            $handler = "";
        }

        if ($handler) {
            ## If a handler is associated with the OAI task
            ## the responsibility for indexing is passed to this handler

            require Cwd;
            my $old_dir = Cwd::getcwd();

            # the handler is run from the task root with a relative target
            my $target_path = $task_info{target};
            $target_path =~ s{^/+}{};

            my @handler_cmd_array = ($handler, $target_path);
            my $handler_cmd_text  = join(" ", @handler_cmd_array);

            if ( !chdir($tkl_file->{root}) ) {
                # running the handler from the wrong directory would make it
                # work on an unrelated relative path, so skip it instead
                TKL::Task::logf("oai_process_task: error: cannot chdir to "
                                ."$tkl_file->{root}: $!; "
                                ."not running $handler_cmd_text");
            }
            elsif ( system(@handler_cmd_array) == 0 ) {
                TKL::Task::logf("oai_process_task: Executed $handler_cmd_text");
            }
            else {
                TKL::Task::logf("oai_process_task: error: failed $handler_cmd_text");
            }

            if ( !defined $old_dir || !chdir($old_dir) ) {
                TKL::Task::logf("oai_process_task: error: cannot return to "
                                ."previous working directory");
            }
        }
        else {
            # indexing harvested files - chopping of leading slash from subdir
            my $subdir = $task_info{target};
            $subdir =~ s@^/@@;
            my @indexargs = ("/etc/init.d/tkl", "index",
                             "$tkl_file->{root}", "$subdir");
            my $indexmsg = join(' ', @indexargs);
            if ( system(@indexargs) == 0 ) {
                TKL::Task::logf("oai_process_task: $indexmsg");
            }
            else {
                TKL::Task::logf("oai_process_task: error: failed $indexmsg");
            }
        }
    }; # end eval - now catch errors and clean up!

    # Copy $@ straight away: any later call may run an eval of its own and
    # reset it before the message has been logged.
    if ( my $err = $@ ) {
        TKL::Task::logf("oai_process_task(): error: failed task $fn");
        TKL::Task::logf("oai_process_task(): error: message is $err");
        $tkl_file->change_task_status("error");
        oai_remove_spoolfile($spoolfile);
        return;
    }
}

# Delete a task's spool file and note the outcome in the debug log.
# Failure to delete is reported but deliberately not treated as an error.
#
# Parameters:
#   $spoolfile - path of the spool file to delete
sub oai_remove_spoolfile {
    my ($spoolfile) = @_;

    my $outcome = unlink($spoolfile) ? "removed" : "could not remove";
    TKL::Task::debug("oai_remove_spoolfile: "
                     ."$outcome spool file $spoolfile");
}


# Harvest all records of an OAI repository with a ListRecords request
# (metadataPrefix oai_dc), following resumption tokens until the list is
# complete. Every record is passed to oai_handle_record() via callback.
#
# Parameters:
#   $oai_url    - base URL of the OAI repository
#   $stylesheet - path of the XSLT stylesheet applied to each record
#   $oai_target - accepted for the caller's benefit but not read here; the
#                 callback takes the directory from file-scope $oai_target_dir
#   $set        - optional OAI set name; omitted from the request when empty
#
# Side effect: stores the compiled stylesheet in file-scope $style_xslt,
# which is where the record callback picks it up.
#
# Dies when the stylesheet cannot be compiled or the harvest throws.
sub oai_get_records {
    my ($oai_url, $stylesheet, $oai_target, $set) = @_;
    my $oai = TKL::OAI->new;

    # compile stylesheet to xslt machine
    TKL::Task::debug("oai_get_records: stylesheet $stylesheet");

    eval {
        # compile stylesheet and set global variable
        $style_xslt = oai_parse_xslt($stylesheet);
    };
    # $@ is copied before logging so that the original message survives even
    # if the logging code runs an eval of its own.
    if ( my $err = $@ ) {
        TKL::Task::logf("oai_get_records(): error: "
                        ."not found valid stylesheet $stylesheet");
        die "oai_get_records(): not found valid stylesheet $stylesheet: $err";
    }

    $oai->oai_option(url => $oai_url);
    $oai->set_callback(record => \&oai_handle_record);

    TKL::Task::debug("oai_get_records(): getting records");

    # getting oai records
    eval {
        my %oai_req = (verb => 'ListRecords', metadataPrefix => 'oai_dc');

        if ( defined $set && length $set ) {
            $oai_req{"set"} = $set;
        }

        my $r = $oai->oai_request(%oai_req);

        # Keep requesting for as long as the repository reports the list as
        # incomplete, pausing after each follow-up request.
        while ( $r->is_success ) {
            last unless my $oai_resumption_token = $oai->incomplete;
            TKL::Task::debug("oai_get_records(): document incomplete, "
                             ."fetched ".$oai->harvested . "/"
                             . $oai->completeListSize);

            TKL::Task::debug("oai_get_records(): reconnecting with token "
                             . $oai_resumption_token);
            $r = $oai->oai_request;
            sleep($oai_sleep_reconnect);
        }

        # A failed request is written to the log unconditionally so that it
        # is visible without verbose mode.
        # NOTE(review): the harvest still returns normally in this case, so
        # the caller will report the task as finished - confirm whether a
        # failed request should raise an exception instead.
        if ( !$r->is_success ) {
            TKL::Task::logf("oai_get_records(): connection error "
                            .$r->status_line);
        }
    };
    if ( my $err = $@ ) {
        TKL::Task::logf("oai_get_records(): error: "
                        ."could not get records from $oai_url");
        die "oai_get_records(): could not get records from $oai_url: $err";
    }

}

# Build an open() mode string that requests UTF-8 output where the running
# perl supports I/O layers (5.8 and later); older perls get the mode as is.
# ("use open ':utf8'" was found not to work on perl 5.6.1 from debian stable.)
#
# Parameters:
#   $mode - plain open mode such as ">"
#
# Returns the mode, with ":utf8" appended on perl 5.8 or newer.
sub open_mode_utf8 {
    my ($mode) = @_;

    return $] >= 5.008 ? $mode . ":utf8" : $mode;
}


# Callback invoked for every harvested record: transforms the record with
# the compiled stylesheet and writes the result to the next free file in the
# target directory.
#
# Parameters:
#   $oai_obj - the OAI object issuing the callback (not used here)
#   $raw_xml - XML text of the record
#
# Reads file-scope $oai_target_dir and $style_xslt, because the callback
# interface offers no way to pass them in.
#
# Returns an empty list when the output file cannot be opened, the result
# of unlink() when the transformation fails (the partial file is removed),
# and the result of close() otherwise.
sub oai_handle_record {
    my ($oai_obj, $raw_xml) = @_;

    my $oaifile = oai_next_file($oai_target_dir);

    # No explicit utf8 layer is set on the handle:
    # my $openmode =  open_mode_utf8(">");
    my $out;
    if ( !open($out, ">", $oaifile) ) {
        TKL::Task::logf("oai_handle_record: error: cannot open $oaifile "
                        ."for writing: $!");
        return;
    }

    # put every tag on a line of its own
    $raw_xml =~ s/\>\</\>\n\</g;

    eval {
        # using global variable $style_xslt
        print $out oai_xslt_transform($style_xslt, $raw_xml);
    };
    if ( my $err = $@ ) {
        TKL::Task::debug("oai_handle_record: problem with $oaifile : $err\n");
        close($out);
        return unlink($oaifile);
    }

    # buffered write errors only show up when the handle is closed
    my $closed = close($out);
    if ( !$closed ) {
        TKL::Task::logf("oai_handle_record: error: cannot close $oaifile: $!");
    }
    return $closed;
}


# Compile an XSLT stylesheet file with XML::LibXSLT.
#
# Parameters:
#   $stylesheet - path of the stylesheet file
#
# Returns the compiled stylesheet object.
# Dies, after logging the parser's message, when the file cannot be parsed.
#
# The result is kept in a local variable; it is the caller's job to store
# it wherever it is needed.
sub oai_parse_xslt {
    my ($stylesheet) = @_;

    # initialise xslt transform engine
    my $lib_xslt = XML::LibXSLT->new;

    my $compiled;
    eval {
        $compiled = $lib_xslt->parse_stylesheet_file($stylesheet);
    };
    # copy $@ first so that logging cannot reset it before it is rethrown
    if ( my $err = $@ ) {
        TKL::Task::logf("oai_parse_xslt: XML::LibXSLT exception "
                        ."parse_stylesheet_file $err\n");
        die "oai_parse_xslt: cannot parse stylesheet $stylesheet: $err";
    }

    # everything ok, return compiled stylesheet object
    return $compiled;
}


# Parse an XML string and run it through a compiled XSLT stylesheet.
#
# Parameters:
#   $style_xslt - compiled stylesheet object, as returned by oai_parse_xslt()
#   $xml        - XML text to transform
#
# Returns the transformation result as produced by the stylesheet's
# output_string() method.
# Dies, after logging the library's message, when the XML cannot be parsed
# or the transformation fails.
sub oai_xslt_transform {
    my ($style_xslt, $xml) = @_;

    # initialise xml dom parser
    my $lib_xml = XML::LibXML->new;
    # don't choke on malformed xml - try to get the best out of it
    # $lib_xml->recover(1); # does not work in debian stable!

    my $xml_dom;
    eval {
        $xml_dom = $lib_xml->parse_string($xml);
    };
    # $@ is copied before logging so the message cannot be lost on rethrow
    if ( my $err = $@ ) {
        TKL::Task::logf("oai_xslt_transform: "
                        ."XML::LibXML exception parse_string $err\n");
        die "oai_xslt_transform: cannot parse record: $err";
    }

    # now process output xml tree
    my $result;
    eval {
        $result = $style_xslt->transform($xml_dom);
    };
    if ( my $err = $@ ) {
        TKL::Task::logf("oai_xslt_transform: "
                        ."XML::LibXSLT exception transform $err\n");
        die "oai_xslt_transform: cannot transform record: $err";
    }

    # everything went ok, send output
    return $style_xslt->output_string($result);
}




# Turn an oai_dc metadata fragment into a <link> fragment by plain text
# substitution: <metadata> becomes <link>, the <oai_dc:dc> wrapper is
# dropped, the "dc:" prefix is stripped from the Dublin Core elements listed
# below, and runs of spaces are collapsed.
#
# Parameters:
#   $xml - XML text of one record's metadata
#
# Returns the rewritten text.
sub oai_transform_2_link {
    my ($xml) = @_;

    # change metadata to link
    $xml =~ s/<metadata/<link/g;
    $xml =~ s/<\/metadata>/<\/link>/g;

    # Drop the oai_dc:dc wrapper together with its namespace declarations.
    # [^>]* stops at the end of the opening tag; a greedy .* would also
    # swallow every element that follows it on the same line.
    $xml =~ s/<oai_dc:dc[^>]*>//g;
    $xml =~ s/<\/oai_dc:dc>//g;

    # strip the dc: prefix from opening and closing tags
    my $dc_names = join '|', qw(
        title creator subject description publisher contributor date type
        format identifier source language relation coverage rights
    );
    $xml =~ s/<dc:($dc_names)/<$1/g;
    $xml =~ s/<\/dc:($dc_names)>/<\/$1>/g;

    # collapse runs of spaces into a single space
    $xml =~ s/ +/ /g;

    return $xml;
}

# Work out the path of the next harvested file: one number above the highest
# "<prefix><number><suffix>" file already present in the target directory.
#
# Parameters:
#   $oai_target_dir - directory that receives the harvested files
#
# Uses file-scope $oai_file_prefix and $oai_file_suffix.
# Returns the full path of the file to create next.
sub oai_next_file {
    my ($oai_target_dir) = @_;

    my $next = oai_max_file_number($oai_target_dir,
                                   $oai_file_prefix . "*" . $oai_file_suffix);
    # kept as ++ on purpose: it is applied to the value exactly as returned
    ++$next;

    return join("", $oai_target_dir, "/",
                $oai_file_prefix, $next, $oai_file_suffix);
}


# Find the highest number used in the names of the files matching a glob
# mask inside a directory.
#
# Parameters:
#   $dir  - directory to search
#   $mask - file name glob, e.g. 'oai*.tkl'
#
# Returns the largest number found, or 0 when no matching file name contains
# one. The number taken from a name is its last run of digits.
sub oai_max_file_number {
    my $dir  = shift;   # dir path where to look for
    my $mask = shift;   # filename mask 'oai*.tkl'
    my $max  = 0;

    # bsd_glob() takes the pattern as a single string; the <...> operator
    # would split it at whitespace and miss directories with spaces.
    require File::Glob;
    for my $path ( File::Glob::bsd_glob("$dir/$mask") ) {
        # Excluding "/" after the digits restricts the match to the file
        # name, so digits in the directory path are never picked up.
        next unless $path =~ m{(\d+)[^\d/]+$};
        $max = $1 if $1 > $max;
    }
    return $max;
}

