#!/usr/bin/tclsh 
#   Copyright (C) 2002-2004 Index Data Aps, www.indexdata.dk
#
#   This file is part of TKL.
#
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; version 2 dated June, 1991.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   A copy of the GNU General Public License is also available at
#   <URL:http://www.gnu.org/copyleft/gpl.html>.  You may also obtain
#   it by writing to the Free Software Foundation, Inc., 59 Temple
#   Place - Suite 330, Boston, MA 02111-1307, USA.
#
#   $Id: tkl-web-harvester,v 1.60 2004/06/08 13:32:57 adam Exp $

set loghandle stdout
set robotsRunning 0
set workdir [pwd]
# 30 seconds in miliseconds !! 
set idletime 30000  
set acceptLanguage {}
set debuglevel -1
set libdir ""
set clockseconds [clock seconds]

proc logmsg {msg} {
    global loghandle
    global clockseconds
    set clockseconds [clock seconds]
    #set humantime [clock format $clockseconds]
    #set humantime [clock format $clockseconds -format %c]
    set humantime [clock format $clockseconds -format "%Y-%m-%e %T"]
    puts $loghandle "$humantime $msg"
    flush $loghandle
}

proc dbgmsg {lvl msg} {
    global debuglevel
    if {[expr $debuglevel >= $lvl]} {
        logmsg $msg
    }
}

proc fnameEncode {fname} {
    return [string map {% %25 & %26 < %3C > %E ? %3F * %2A ' %27} $fname]
}

proc fnameDecode {fname} {
    return [string map {%25 % %26 & %3C < %E > %3F ? %2A * %27 '} $fname]
}

proc RobotFileNext1 {area lead} {
    dbgmsg 7 "RobotFileNext1 area=$area lead=$lead"
    if {[catch {set ns [glob ${area}/*]}]} {
        return {}
    }
    foreach n $ns {
	if {[file isfile $n]} {
            set off [string last / $n]
	    # skip /
	    incr off
	    set end [string length $n]
	    # skip _.tkl
	    incr end -6
            return $lead/[string range $n $off $end]
        }
    }
    foreach n $ns {
	if {[file isdirectory $n]} {
            set off [string last / $n]
	    # skip /
	    incr off
            set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
            if {[string length $sb]} {
                return $sb
            }
        }
    }
    return {}
}

proc RobotWriteRecord {outf fromurl distance} {
    puts $outf {<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>}
    puts $outf "<zmbot>"
    puts $outf "<distance>"
    puts $outf $distance
    puts $outf "</distance>"
    puts $outf "<fromurl>"
    puts $outf $fromurl
    puts $outf "</fromurl>"
    puts $outf "</zmbot>"
}

proc RobotReadRecord {inf fromurlx distancex} {
    upvar $fromurlx fromurl
    upvar $distancex distance
    gets $inf
    gets $inf
    gets $inf
    set distance [string trim [gets $inf]]
    dbgmsg 10 "RobotReadRecord(): distance = $distance"
    gets $inf
    gets $inf
    set fromurl [string trim [gets $inf]]
}

proc RobotFileNext {task area} {
    global control
    global idletime ns
    global status

    dbgmsg 10 "RobotFileNext(): seq=$control($task,seq)"
    if {$control($task,seq) < 0} {
	return {}
    }
    set target $control($task,target)
    if {$control($task,seq) == 0} {
	if {[catch {set ns($task) [glob $target/$area/*]}]} {
	    dbgmsg 5 "RobotFileNext(): done target=$target"
	    return done
	}
    }
    dbgmsg 10 "RobotFileNext(): ns=$ns($task)"
    set off [string length $target/$area]
    incr off
    set n [lindex $ns($task) $control($task,seq)]
    dbgmsg 10 "RobotFileNext(): n=$n"
    if {![string length $n]} {
	set control($task,seq) -1
        set statusfile [open $target/status w]
        puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)"
        close $statusfile
	return wait
    }
    incr control($task,seq)
    if {[file isfile $n/robots.txt_.tkl]} {
	dbgmsg 10 "RobotFileNext(): ok returning http://[string range $n $off end]/robots.txt"
	return [fnameDecode http://[string range $n $off end]/robots.txt]
    } elseif {[file isdirectory $n]} {
	set sb [RobotFileNext1 $n http://[string range $n $off end]]
	if {[string length $sb]} {
	    return [fnameDecode $sb]
	}
    }
    dbgmsg 5 "RobotFileNext(): no more work at n=$n"
    dbgmsg 5 "RobotFileNext(): ns=$ns($task)"
    return {}
}


proc RobotFileExist {task area host path} {
    global debuglevel control

    dbgmsg 5 "RobotFileExist(): begin area=$area host=$host path=$path"

    set target $control($task,target)
    return [file exists [fnameEncode $target/$area/$host${path}_.tkl]]
}

proc RobotFileUnlink {task area host path} {
    global status control

    set target $control($task,target)
    dbgmsg 10 "RobotFileUnlink(): begin"
    dbgmsg 10 "RobotFileUnlink(): area=$area host=$host path=$path"
    set npath [fnameEncode $target/$area/$host${path}_.tkl]
    dbgmsg 10 "RobotFileUnlink(): npath=$npath"
    set comp [split $npath /]
    if {[catch {exec rm -rf -- $npath}]} return

    set l [llength $comp]
    incr l -2
    incr status($task,$area) -1
    for {set i $l} {$i > 0} {incr i -1} {
        set path [join [lrange $comp 0 $i] /]
	if {![catch {glob $path/*}]} return
        #exec rmdir $path # this one died on hidden directories!
	if {[catch {exec rm -rf -- $path}]} {
	    logmsg "error: can not remove: rm -rf $path"
	    return
	}
    }
    dbgmsg 10 "RobotFileUnlink(): end"
}

proc RobotFileClose {out} {
    if [string compare $out stdout] {
	close $out
    }
}

proc RobotFileOpen {task area host path {mode w}} {
    global URL

    set orgPwd [pwd]
    global workdir status debuglevel control

    dbgmsg 10 "RobotFileOpen(): task=$task path=$path"

    set target $control($task,target)
    set path [fnameEncode $path]

    if {![info exists workdir]} {
	return stdout
    }
    if {$debuglevel > 3} {
        dbgmsg 5 "RobotFileOpen(): orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
    }
    if {[string compare $orgPwd $workdir]} {
        dbgmsg 5 "RobotFileOpen(): error: failed"
	dbgmsg 5 "RobotFileOpen(): workdir = $workdir"
	dbgmsg 5 "RobotFileOpen(): pwd = $orgPwd"
	#exit 1
	return ""
    }

    set comp [split $target/$area/$host /]
    set len [llength $comp]
    incr len -1

    dbgmsg 10 "RobotFileOpen(): 1 comp=$comp"

    for {set i 0} {$i <= $len} {incr i} {
        set d [lindex $comp $i]
	if {[string length $d] == 0} {
	    cd /
	} elseif {[catch {cd $d}]} {

            # exec mkdir $d Might fail as well - do check#
	    if {[catch {exec mkdir -- $d}]} {
		logmsg "error: could not create $d "
		return ""
	    }

            cd ./$d
	    if {![string compare $area unvisited] && $i == $len && $mode == "w"} {
		if {[string compare $path /robots.txt] && ![info exists URL($host,robots)]} {
		    set out [open robots.txt_.tkl w]
		    dbgmsg 5 "RobotFileOpen(): creating robots.txt in $d host=$host"
		    close $out
                    incr status($task,unvisited)
		}
	    }
        }
    }

    set comp [split $path /]
    set len [llength $comp]
    incr len -1

    dbgmsg 10 "RobotFileOpen(): 2 path=$path comp=$comp"

    for {set i 0} {$i < $len} {incr i} {
        set d [lindex $comp $i]
        if {[string length $d] > 0} {
            if {[catch {cd $d}]} {
		# exec mkdir $d Might fail as well - do check#
		if {[catch {exec mkdir -- $d}]} {
		    logmsg "error: could not create $d "
		    return ""
		}
		dbgmsg 5 "RobotFileOpen(): exec mkdir $d"
                cd ./$d
            }
        }
    }

    set d [lindex $comp $len]
    dbgmsg 10 "RobotFileOpen(): 3 d=$d"

    set out ""
    dbgmsg 10 "RobotFileOpen(): 4b out=$out mode=$mode"
    if { [ catch { set out [open ${d}_.tkl $mode ] } ] } {
	dbgmsg 5 "RobotFileOpen(): error: failed open ${d}_.tkl $mode"
	return $out
    } else {
	dbgmsg 5 "RobotFileOpen(): open out=$out mode=$mode"
    }

    if {$mode == "w"} {
        incr status($task,$area)
	dbgmsg 10 "RobotFileOpen(): 5 out=$status($task,$area)"
    }

    cd $orgPwd
    dbgmsg 10 "RobotFileOpen(): 6"
    return $out
}

proc RobotSpoolJob {spoolfile root task} {
    global control

    set fname "$root$task"
    set f [open $fname r]
    set xml [read $f]
    dbgmsg 3 "RobotSpoolJob(): processing $spoolfile"
    dbgmsg 3 "RobotSpoolJob(): reading $fname"
    close $f
    # task type must be "web"
    if {![regexp {<tasktype>([^<]*)</tasktype>} $xml x tasktype]} {
	return
    }
    set tasktype [string trim $tasktype]
    dbgmsg 3 "RobotSpoolJob(): tasktype = $tasktype"
    if {![string match "web" $tasktype]} {
	return
    }

    # status must not be finished or error 
    if {![regexp {<status>([^<]*)</status>} $xml x status]} {
	return
    }
    set status [string trim $status]
    dbgmsg 5 "RobotSpoolJob(): status = $status"
    if {$status == "finished"} {
        dbgmsg 5 "RobotSpoolJob(): already finished"
        return
    }
    if {$status == "error"} {
        dbgmsg 5 "RobotSpoolJob(): already finished due to error"
        return
    }
    if {$status == "collect"} {
        dbgmsg 5 "RobotSpoolJob(): uri's must be collected first"
        return
    }

    # target dir must be set
    if {![regexp {<target>([^<]*)</target>} $xml x targetdir]} {
	dbgmsg 3 "RobotSpoolJob(): target = $targetdir"
	return
    }
    set targetdir [string trim $targetdir]

    # target dir must be directory
    if {![file isdirectory $root/$targetdir]} {
	logmsg "error : target = $targetdir not existent"
	logmsg "error : can not write to $root/$targetdir"
        regsub {<status>[^<]*</status>} $xml {<status>error</status>} xml2
        set f [open $fname w]
        puts -nonewline $f $xml2 
        close $f
	return
    }

    # ignore if task has already been processed
    if {![CreateTask $task]} {
        return
    }
    dbgmsg 5 "RobotSpoolJob(): spool file $spoolfile"
    set control($task,spoolfname) $spoolfile
    dbgmsg 3 "RobotSpoolJob(): processing task $fname"
    set control($task,taskfname) $fname
    dbgmsg 3 "RobotSpoolJob(): processing in portal root $root"
    set control($task,root) $root

    htmlSwitch $xml \
        url {
	    lappend starturls $body
        } filter {
            set type $parm(type)
            set action $parm(action)
            if {$type == "domain"} {
                $action url http://$body/*
            }
            if {$type == "url"} {
                $action url $body
            }
            if {$type == "mime"} {
                $action mime $body
            }
	} target {
	    set ex [file rootname [file tail $task]]
	    #set control($task,target) "$root$body/$ex"
	    set control($task,target) "$control(tmpdir)/$ex"
	    set control($task,subdir) "$body"
	    set control($task,output) "$root/$body"
        } distance {
            set control($task,distance) $body
        } status {
            set control($task,filestatus) $body
        } tasktype {
	    set control($task,tasktype) $body
        }
    dbgmsg 10 "RobotSpoolJob(): control(\$task,target)=$control($task,target)"
    
    if {$status == "pending"} {
	dbgmsg 5 "RobotSpoolJob(): processing spoolfile $spoolfile "
	dbgmsg 5 "RobotSpoolJob(): pending job at $fname "
        regsub {<status>[^<]*</status>} $xml {<status>running</status>} xml2
        set f [open $fname w]
        puts -nonewline $f $xml2 
        close $f
	logmsg "running job at $fname "
    }

    if {[info exists starturls]} {
	foreach url $starturls {
	    logmsg "$url 000 000"
	    url $url
	}
    }
}

proc RobotDoneJob {task} {
    global daemon_dir control

    if {![info exists daemon_dir]} {
        return
    }
    set fname $control($task,taskfname)
    set f [open $fname r]
    set xml [read $f]
    dbgmsg 1 "RobotDoneJob(): running job at $fname "
    regexp {<status>([^<]*)</status>} $xml x status
    close $f

    regsub {<status>[^<]*</status>} $xml {<status>finished</status>} xml2
    set f [open $fname w]
    puts -nonewline $f $xml2 
    close $f
    logmsg "finished job at $fname "

    # cleaning spool dir up
    set spfile $control($task,spoolfname)
    if { [ catch { file delete "$spfile" } ] } {
	dbgmsg 2 "RobotDoneJob(): spoolfile $spfile already deleted "
    } else {
	dbgmsg 2 "RobotDoneJob(): spoolfile $spfile deleted "
    }

    #cleaning temp dir up
    set tmpdir $control($task,target)
    if { [ catch { file delete -force -- $tmpdir } ] }  { 
	logmsg "error :  could not remove tmpdir $tmpdir"
    } else {
	dbgmsg 2 "RobotDoneJob(): tmpdir $tmpdir removed"
    }

    # indexing harvested ressources - chopping out leading slash from 3d arg
    set rootdir $control($task,root) 
    set subdir ""
    regsub {^\/+} $control($task,subdir) {} subdir
    set indexcommand "/etc/init.d/tkl index $rootdir $subdir"
    #if { [ catch { eval exec $indexcommand >& /dev/null } errmsg ] }
    logmsg "start indexing  $indexcommand "
    if { [ catch { eval exec $indexcommand >& /dev/null } ] } {
	#logmsg "error : failed $indexcommand :\n $errmsg"
	logmsg "error : failed $indexcommand "
    } else {
	logmsg "finished indexing  $indexcommand "
    }

    # cleaning variable spaces up 
    unset control($task,distance)
    unset control($task,filestatus)
    unset control($task,output)
    unset control($task,root)
    unset control($task,spoolfname)
    unset control($task,subdir)
    unset control($task,target)
    unset control($task,taskfname)
    unset control($task,tasktype)
}

proc RobotScanDir {} {
    global daemon_dir

    if {![info exists daemon_dir]} {
        return
    }
    dbgmsg 2 "RobotScanDir(): scanning spool $daemon_dir"
    foreach d $daemon_dir {
        if {[catch {set files [glob $d/*.spl]}]} {
            return
        }
        foreach spfname $files {
            if {[file isfile $spfname] && [file readable $spfname]} {
		set jobfile [open $spfname]
		gets $jobfile portalroot
		gets $jobfile portaltask
		close $jobfile
		dbgmsg 3 "RobotScanDir(): spool file $spfname"
		dbgmsg 3 "RobotScanDir(): portal root $portalroot"
		dbgmsg 3 "RobotScanDir(): portal task $portaltask"

                RobotSpoolJob $spfname $portalroot $portaltask
            }
        }
    }
}

proc DumpGlobals {} {
    global loghandle control ns status URL agent
    set f $loghandle
    set full 0
    if {$full} {
        puts $f "DumpGlobals"
    }
    set count 0
    foreach var [info globals] {
        if {$var == "f"} continue
        if {$var == "sessionId"} continue
        if {$var == "errorInfo"} continue
        if {[catch {set names [array names $var]}]} {
	    if {$full} {
                eval "set v \$${var}"
                puts $f "set ${var} \{$v\}"
	    }
	    incr count
        } else {
            foreach n $names {
	        if {$full} {
                    eval "set v \$${var}(\$n)"
                    puts $f "set ${var}($n) \{$v\}"
		}
		incr count
            }
        }
    }
    puts $f "DumpGlobals(): $count"    
    #dbgmsg 0 "DumpGlobals(): $count"    
}

proc RobotRR {task} {
    global control robotsRunning tasks robotsMax status

    dbgmsg 5 "RobotRR(): running=$robotsRunning max=$robotsMax"
    incr robotsRunning -1

    # only one task gets through...
    if {[string compare [lindex $tasks 0] $task]} {
        return
    }
    dbgmsg 5 "RobotRR(): task = $task"
    while {$robotsRunning} {
	vwait robotsRunning
    }
    #DumpGlobals
    dbgmsg 5 "RobotRR(): scan"
    if {[catch {RobotScanDir} msg]} {
        dbgmsg 3 "RobotRR(): RobotScanDir failed"
        dbgmsg 3 "RobotRR(): $msg"
    }
    foreach t $tasks {
	set target $control($t,target)
        set statusfile [open $target/status w]
        puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
        close $statusfile
        set control($t,seq) 0
        RobotStart $t
    }
}

proc RobotDaemonSig {} {
    global daemon_cnt

    incr daemon_cnt
}

proc RobotDaemonLoop {} {
    global daemon_cnt tasks robotsRunning status

    set daemon_cnt 0
    while 1 {
        dbgmsg 5 "RobotDaemonLoop(): daemon loop count $daemon_cnt"
        RobotScanDir
        
        if {[info exists tasks]} {
            dbgmsg 5 "RobotDaemonLoop(): tasks $tasks"
            foreach t $tasks {
                set control($t,seq) 0
                RobotStart $t
            }
            while {$robotsRunning} {
                vwait robotsRunning
            }
        }
        after 30000 RobotDaemonSig
        vwait daemon_cnt
    }
}

proc RobotRestart {task url sock} {
    global URL robotsRunning
    dbgmsg 5 "RobotRestart(): restarting"

    close $sock
    after cancel $URL($sock,cancel) 

    array unset URL $task,$url,*

    incr robotsRunning -1
    RobotStart $task
}

proc RobotStart {task} {
    global URL
    global robotsRunning robotsMax idletime status tasks
  
    dbgmsg 5 "RobotStart(): $task running=$robotsRunning"
    while {1} {
        set url [RobotFileNext $task unvisited]
	if {[string compare $url done] == 0} {
            dbgmsg 5 "RobotStart(): task $task done"

            catch {unset ntasks}
            foreach t $tasks {
                if {[string compare $t $task]} {
                    lappend ntasks $t
                } else {
                    dbgmsg 5 "RobotStart(): task $t done"
                }
            }
            if {![info exists ntasks]} {
                unset tasks
                dbgmsg 5 "RobotStart(): all done"
            } else {
                set tasks $ntasks
            }
            RobotDoneJob $task
	    return
	}
        if {![string length $url]} {
	    return
	}
        incr robotsRunning
	if {[string compare $url wait] == 0} {
            after $idletime [list RobotRR $task]
            return
	}
        set r [RobotGetUrl $task $url {}]
        if {!$r} {
	    if {$robotsRunning >= $robotsMax} return
        } else {
	    incr robotsRunning -1
	    if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
		dbgmsg 5 "RobotStart(): before RobotFileOpen"
		set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
		dbgmsg 5 "RobotStart(): after RobotFileOpen"
		RobotFileClose $outf
		dbgmsg 5 "RobotStart(): after RobotFileClose"
	    }
            RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
	    array unset URL $task,$url,*
	}
    }
}

proc headSave {task url out} {
    global URL
    
    if {[info exists URL($task,$url,head,last-modified)]} {
        puts $out "<lastmodified>$URL($task,$url,head,last-modified)</lastmodified>"
    }
    puts $out {<si>}
    if {[info exists URL($task,$url,head,date)]} {
        puts $out " <date>$URL($task,$url,head,date)</date>"
    }
    if {[info exists URL($task,$url,head,content-length)]} {
        puts $out " <by>$URL($task,$url,head,content-length)</by>"
    }
    if {[info exists URL($task,$url,head,server)]} {
        puts $out " <format>$URL($task,$url,head,server)</format>"
    }
    puts $out {</si>}
    puts $out {<publisher>}
    set wurl [wellform $url]
    puts $out " <identifier>$wurl</identifier>"
    if {[info exists URL($task,$url,head,content-type)]} {
        puts $out " <type>$URL($task,$url,head,content-type)</type>"
    }
    puts $out {</publisher>}
}

proc RobotHref {task url hrefx hostx pathx} {
    global URL control debuglevel
    upvar $hrefx href
    upvar $hostx host
    upvar $pathx path

    dbgmsg 5 "RobotHref(): ref input url = $url href=$href"

    if {[string first { } $href] >= 0} {
	return 0
    }
    if {[string length $href] > 128} {
	return 0
    }

#   Skip pages that have ? in them
    if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
	return 0
    }
    # get method (if any)
    if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
	set hpath $href
	set method http
    } else {
	if {[string compare $method http]} {
	    return 0
	}
    }
    # get host (if any)
    if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
        if {[info exist control($task,domains)]} {
	    set ok 0
	    foreach domain $control($task,domains) {
	        if {[string match $domain $host]} {
		    set ok 1
		    break
                }
	    }
	    if {!$ok} {
	        return 0
	    }
        }
    } else {
	regexp {^([^\#]*)} $hpath x surl
	set host $URL($task,$url,hostport)
    }
    if {![regexp {^[a-zA-Z0-9].*} $host]} {
	return 0
    }
    if {![string length $surl]} {
	return 0
    }
    if {[string first / $surl]} {
	# relative path
        set curpath $URL($task,$url,path)
        if {[info exists URL($task,$url,bpath)]} {
            set curpath $URL($task,$url,bpath)
        }
	regexp {^([^\#?]*)} $curpath x dpart
	set l [string last / $dpart]
	if {[expr $l >= 0]} {
	    set surl [string range $dpart 0 $l]$surl
	} else {
	    set surl $dpart/$surl
	}
    }
    set surllist [split $surl /]
    catch {unset path}
    set pathl 0
    foreach c $surllist {
        switch -- $c {
	    .. {
 		if {$pathl > 1} {
		    incr pathl -2
		    set path [lrange $path 0 $pathl]
		    incr pathl
		}
	    }
            . {

            }
            default {
		incr pathl
                lappend path [string trimleft $c .]
	    }
	}
    }
    dbgmsg 7 "RobotHref(): pathl=$pathl output path=$path"
    set path [join $path /]
    if {![string length $path]} {
	set path /
    }
    regsub -all {~} $path {%7E} path
    set href "$method://$host$path"

    dbgmsg 5 "RobotHref(): ref result = $href"
    return [checkrule $task url $href]
}

proc RobotError {task url code} {
    global URL

    dbgmsg 5 "RobotError(): bad URL $url (code $code)"
    set fromurl {}
    set distance -1
    if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
	dbgmsg 5 "RobotError(): before unvisited RobotFileOpen"
	set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
	dbgmsg 5 "RobotError(): before unvisited RobotReadRecord"
	RobotReadRecord $inf fromurl distance
	dbgmsg 5 "RobotError(): before unvisited RobotFileClose"
	RobotFileClose $inf
	dbgmsg 5 "RobotError(): after unvisited RobotFileClose"
    }
    RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
    if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
	dbgmsg 5 "RobotError(): before bad RobotFileOpen"
	set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
	dbgmsg 5 "RobotError(): before bad RobotWriteRecord"
	RobotWriteRecord $outf $fromurl $distance
	dbgmsg 5 "RobotError(): before bad RobotFileClose"
	RobotFileClose $outf
	dbgmsg 5 "RobotError(): after bad RobotFileClose"
    }
}

proc RobotRedirect {task url tourl code} {
    global URL

    dbgmsg 5 "RobotRedirect(): redirecting from $url to $tourl"

    set distance {}
    set fromurl {}
    if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {

	dbgmsg 5 "RobotRedirect(): before unvisited RobotFileOpen"
	set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
	dbgmsg 5 "RobotRedirect(): before unvisited RobotReadRecord"
	RobotReadRecord $inf fromurl distance
	dbgmsg 5 "RobotRedirect(): before unvisited RobotFileClose"
	RobotFileClose $inf
	dbgmsg 5 "RobotRedirect(): after unvisited RobotFileClose"
    }
    if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
	set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
	RobotWriteRecord $outf $fromurl $distance
	RobotFileClose $outf
    }
    if {[RobotHref $task $url tourl host path]} {
	if {![RobotFileExist $task visited $host $path]} {
	    if {![RobotFileExist $task unvisited $host $path]} {
		set outf [RobotFileOpen $task unvisited $host $path]
		RobotWriteRecord $outf $fromurl $distance
		RobotFileClose $outf
	    }
	} else {
	    set olddistance {}
	    set inf [RobotFileOpen $task visited $host $path r]
	    RobotReadRecord $inf oldurl olddistance
	    RobotFileClose $inf
	    if {[string length $olddistance] == 0} {
		set olddistance 1000
	    }
	    if {[string length $distance] == 0} {
		set distance 1000
	    }
	    dbgmsg 5 "RobotRedirect(): distance=$distance olddistance=$olddistance"
	    if {[expr $distance < $olddistance]} {
		set outf [RobotFileOpen $task unvisited $host $path]
		RobotWriteRecord $outf $tourl $distance
		RobotFileClose $outf
	    }
	}
    }
    if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} {
        dbgmsg 5 "RobotRedirect(): unlink failed"
        exit 1
    }
}

proc wellform {body} {
    regsub -all {\n+} $body { } abody
    regsub -all {\t+} $abody { } body
    regsub -all {\r+} $body { } abody

    regsub -all -nocase {<script[^<]*</script>} $abody {} body
    #regsub -all -nocase {<script.*?</script>} $abody { } body
    regsub -all {<!--[^-]*-->} $body { } abody
    #regsub -all {<!--.*?-->} $body { } abody
    regsub -all {<[^\>]+>} $abody {} body
    #regsub -all {<.+?>} $abody { } body

    regsub -all {<} $body { } abody
    regsub -all {>} $abody { } body
    regsub -all {[\0-\37]} $body { } abody
    regsub -all {&nbsp;} $abody { } body
    regsub -all {&copy;} $body {} abody
    regsub -all {&aelig;} $abody {} body
    regsub -all {&AElig;} $body {} abody
    regsub -all {&oslash;} $abody {} body
    regsub -all {&Oslash;} $body {} abody
    regsub -all {&aring;} $abody {} body
    regsub -all {&Aring;} $body {} abody
    regsub -all {&} $abody {&amp;} body
    regsub -all { +} $body { } abody
    regsub -all {^ } $abody {} body
    regsub -all { $} $body {} abody
    return $abody
}

proc link {task url out href body distance} {
    global URL control
    if {[expr $distance > $control($task,distance)]} return
    
    if {![RobotHref $task $url href host path]} return
    
    if ($control($task,cr)) {
	puts $out "<cr>"
	set whref [wellform $href]
	puts $out "<identifier>$whref</identifier>"
	set abody [wellform $body]
	puts $out "<description>$abody</description>"
	puts $out "</cr>"
    }
    
    if {![RobotFileExist $task visited $host $path]} {
        set olddistance 1000
        if {![RobotFileExist $task bad $host $path]} {
            if {[RobotFileExist $task unvisited $host $path]} {
                set inf [RobotFileOpen $task unvisited $host $path r]
                RobotReadRecord $inf oldurl olddistance
                RobotFileClose $inf
            }
        } else {
            set olddistance 0
        }
        if {[string length $olddistance] == 0} {
            set olddistance 1000
        }
        if {[expr $distance < $olddistance]} {
            set outf [RobotFileOpen $task unvisited $host $path]
            RobotWriteRecord $outf $url $distance
            RobotFileClose $outf
        }
    } elseif {[string compare $href $url]} {
        set inf [RobotFileOpen $task visited $host $path r]
        RobotReadRecord $inf xurl olddistance
        close $inf
        if {[string length $olddistance] == 0} {
            set olddistance 1000
        }
        if {[expr $distance < $olddistance]} {
            dbgmsg 5 "link(): OK remarking url=$url href=$href"
            dbgmsg 5 "link(): olddistance = $olddistance"
            dbgmsg 5 "newdistance = $distance"
            set outf [RobotFileOpen $task unvisited $host $path]
            RobotWriteRecord $outf $url $distance
            RobotFileClose $outf
        }
    }
}

proc GenerateTKLDir {task url} {
    global URL control
    set dirname "$control($task,output)"
    set subdirname "/$URL($task,$url,hostport)"
    #set control($task,tkloutdir) "$dirname$subdirname"
    set tkloutdir "$dirname$subdirname"
    dbgmsg 3 "GenerateTKLDir(): tkloutdir=$tkloutdir"
    if { [ catch {file mkdir $tkloutdir } ] } {
	logmsg "error : could not create $tkloutdir"
	return ""
    }
    # write index.tkl file
    set xml "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n"
    #set xml "<?xml version=\"1.0\" encoding=\"iso-8859-1\" standalone=\"yes\"?>\n"
    set xml "$xml<subject creator=\"tkl-web-harvester\">\n"
    set subdirname2 ""
    regsub -all {/} $subdirname {} subdirname2  
    set subdirname2 [wellform $subdirname2]
    set xml "$xml  <title>$subdirname2</title>\n"
    set xml "$xml</subject>\n"

    if { [ catch { set tklout [open "$tkloutdir/index.tkl" w] } ] } {
	logmsg "error : could not create $tkloutdir/index.tkl"
	return ""
    } else {
	# does not work in tcl8.3 -sorry
	fconfigure $tklout -encoding utf-8 
	dbgmsg 10 "GenerateTKLDir(): xml = $xml" 
	puts $tklout $xml
	close $tklout
	# return value
	return $tkloutdir
    }
}

proc GenerateTKLFileName {task url} {
    global URL control
    set tkloutdir [GenerateTKLDir $task $url]
    set pathcoding  $URL($task,$url,path)
    set pathcoding2 ""
    regsub -all {[^a-zA-Z0-9-]} $pathcoding {_} pathcoding2 
    #regsub -all {/} $pathcoding {_} pathcoding2 
    #regsub -all {\?} $pathcoding2 {_} pathcoding
    #regsub -all {\.} $pathcoding {_} pathcoding2
    set tklfilename "$tkloutdir/link-$pathcoding2.tkl"
    dbgmsg 4 "GenerateTKLFileName(): tklfilename=$tklfilename"
    return $tklfilename
}

proc RobotTextTkl {task url out} {
    global URL control clockseconds

    set tklfilename [GenerateTKLFileName $task $url]
    dbgmsg 5 "RobotTextTKL(): tklfilename=$tklfilename"


    # set title so we can emit it for the body
    set title ""
    # if true, nothing will be indexed
    #set noindex 1
    # if true, nothing will be followed
    #set nofollow 0

    dbgmsg 5 "RobotTextTkl(): task $task url $url"

    set xml ""
    set xml "$xml<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
    #set xml "$xml <?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
    set humandate [clock format $clockseconds -format %Y-%m-%e]
    set xml "$xml <link origin=\"web\" creator=\"tkl-web-harvester\" created=\"$humandate\">\n"

    set wurl [wellform $url]
    set xml "$xml    <identifier>$wurl</identifier>\n"
    set title_ok 0
    set description_ok 0

    htmlSwitch $URL($task,$url,buf) \
        title {
            # nr title tag er hittet, er body set til indholdet af tagget
            set title [wellform $body]
	    if { $title_ok == 0 &&  $title != "" } {
		set xml "$xml    <title>$title</title>\n"
		set title_ok 1
	    }
        } -nonest meta {
            # al er list med attribut navne som fandtes ind i parm hash
	    set al [array names parm]  
            # lkke igennem attributer
            foreach a $al {
                set element [string tolower $a]
                switch -- $element {
                    "name" {
                        set metaname [string tolower $parm($a)]
                    }
                    "http-equiv" {
                        set metaname [string tolower $parm($a)]
                    }
                    "content" {
                        set metacontent [wellform $parm($a)]
                    }
                }
	 	unset parm($a)
            }

	    if {[info exists metaname]  && [info exists metacontent]} {
                switch -- $metaname {
                    "title" {
			if { $title_ok == 0 &&  $metacontent != "" } {
			    set xml "$xml    <title>$metacontent</title>\n"
			    set title_ok 1
			}
	            }
                    "dc.title" {
			if { $title_ok == 0 &&  $metacontent != "" } {
			    set xml "$xml    <title>$metacontent</title>\n" 
			    set title_ok 1
			}
                    }
                    "description" {
			set xml "$xml    <description>$metacontent</description>\n" 
			set description_ok 1
                    }
                    "dc.description" {
			set xml "$xml    <description>$metacontent</description>\n" 
			set description_ok 1
                    }
                    "dc.identifier" {
			#set xml "$xml    <identifier>$metacontent</identifier>\n" 
                    }
                    "dc.language" {
			set xml "$xml    <language>$metacontent</language>\n" 
                    }
                    "content-language" {
			set xml "$xml    <language>$metacontent</language>\n" 
		    }
                    "dc.subject" {
			set xml "$xml    <subject>$metacontent</subject>\n" 
                    }
                    "keywords" {
			set xml "$xml    <subject>$metacontent</subject>\n" 
                    }
                    "dc.creator" {
			set xml "$xml    <creator>$metacontent</creator>\n" 
                    }
                    "author" {
			set xml "$xml    <creator>$metacontent</creator>\n" 
                    }
                    "dc.publisher" {
			set xml "$xml    <publisher>$metacontent</publisher>\n" 
                    }
                    "dc.rights" {
			set xml "$xml    <rights>$metacontent</rights>\n" 
                    }
                    "copyright" {
			set xml "$xml    <rights>$metacontent</rights>\n" 
                    }
                    "dc.date" {
			set xml "$xml    <date>$metacontent</date>\n" 
                    }
                    "dc.format" {
			set xml "$xml    <format>$metacontent</format>\n" 
                    }
                    "content-type" {
			set xml "$xml    <format>$metacontent</format>\n" 
                    }
                }
	    }

	    catch {unset metaname}
	    catch {unset metacontent}
	} body {
            set bodytext [wellform $body]
	    set xml "$xml    <text>$bodytext</text>\n"
	    if { ! $description_ok } {
		#set shorttext [string range $bodytext 0 200]
		set shorttext [wellform [string range $bodytext 0 200]]
		set xml "$xml    <description>$shorttext</description>\n"
		set description_ok 1
	    }
	    catch {unset shorttext}
	    catch {unset bodytext}
	}
 
    #set xml "$xml    <harvest strategy=\"star\" steps=\"0\">no</harvest>\n"
    set xml "$xml    <harvest>no</harvest>\n"
    set xml "$xml </link>\n"

    if { $title_ok && $description_ok } {
	set tklout [open $tklfilename w]
	fconfigure $tklout -encoding utf-8 
	puts $tklout $xml
	close $tklout
    }
}

proc RobotTextHtml {task url out} {
    global URL control

    # set title so we can emit it for the body
    set title {}
    # if true, nothing will be indexed
    set noindex 0
    # if true, nothing will be followed
    set nofollow 0

    set distance 0
    set fdistance 0
    if {$control($task,distance) < 1000 && [info exists URL($task,$url,dist)]} {
        set fdistance $URL($task,$url,dist)
	set distance [expr $fdistance + 1]
    }

    dbgmsg 5 "RobotTextHtml(): task $task url $url"

    htmlSwitch $URL($task,$url,buf) \
        title {
            set title $body
        } -nonest meta {
            # collect metadata and save NAME= CONTENT=..
            set metaname {}
            set metacontent {}
            puts -nonewline $out "<meta"
	    set al [array names parm]
            foreach a $al {
                set al [string tolower $a]
		puts -nonewline $out " $al"
                puts -nonewline $out {="}
                puts -nonewline $out $parm($a)
                puts -nonewline $out {"}
                switch -- $al {
                    "name" {
                        set metaname [string tolower $parm($a)]
                    }
                    "content" {
                        set metacontent $parm($a)
                    }
                }
	 	unset parm($a)
            }
	    puts $out "></meta>"
            # go through robots directives (af any)
            if {![string compare $metaname robots]} {
                set direcs [split [string tolower $metacontent] ,]
                if {[lsearch $direcs noindex] >= 0} {
                    set noindex 1
                }
                if {[lsearch $direcs nofollow] >= 0} {
                    set nofollow 1
                }
            }
	} body {
            # don't print title of document content if noindex is used
            if {!$noindex} {
                puts $out "<title>$title</title>"
		set bbody [wellform $body]
                puts $out "<documentcontent>"
                puts $out $bbody
                puts $out "</documentcontent>"
            }
        } -nonest base {
            # <base href=.. >
            if {![info exists parm(href)]} {
	        continue
            }
            set href [string trim $parm(href)]
            if {![RobotHref $task $url href host path]} continue
            set URL($task,$url,bpath) $path
        } -nonest a {
            # <a href="...."> .. </a> 
            # we're not using nonest - otherwise body isn't set
            if {$nofollow} continue
            if {![info exists parm(href)]} {
	        continue
            }
            link $task $url $out [string trim $parm(href)] $body $distance
        } -nonest area {
            if {$nofollow} continue
            if {![info exists parm(href)]} {
	        continue
            }
            link $task $url $out [string trim $parm(href)] $body $distance
        } -nonest frame {
            if {![info exists parm(src)]} {
	        continue
            }
            link $task $url $out [string trim $parm(src)] $body $fdistance
	}
}

proc RobotsTxt {task url} {
    global agent URL
    dbgmsg 5 "RobotTxt(): task $task url $url"

    RobotsTxt0 $task URL($URL($task,$url,hostport),robots) $URL($task,$url,buf)
}

proc RobotsTxt0 {task v buf} {
    global URL agent
    set section 0
    dbgmsg 5 "RobotTxt0(): task $task"

    if {[info exists $v]} {
	dbgmsg -1 "RobotsTxt0: warning. $v"
    }
    foreach l [split $buf \n] {
	if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} {
            set arg [string trim $arg]
	    dbgmsg 5 "RobotTxt0(): cmd=$cmd arg=$arg"
	    switch -- [string tolower $cmd] {
		user-agent {
		    if {$section} break
		    set pat [string tolower $arg]*
		    set section [string match $pat $agent]
		}
		disallow {
		    if {$section} {
			dbgmsg 5 "RobotTxt0(): rule [list 0 $arg]"
			lappend $v [list 0 $arg]
		    }
		}
		allow {
		    if {$section} {
			dbgmsg 5 "RobotTxt0(): rule [list 1 $arg]"
			lappend $v [list 1 $arg]
		    }
		}
	    }
	}
    }
    lappend $v [list 1 /]
}

proc RobotTextPlain {task url out} {
    global URL
    dbgmsg 5 "RobotTextPlain(): task $task url $url"

    puts $out "<documentcontent>"
    regsub -all {<} $URL($task,$url,buf) {\&lt;} content
    puts $out $content
    puts $out "</documentcontent>"

    if {![string compare $URL($task,$url,path) /robots.txt]} {
	RobotsTxt $task $url
    }
}

proc RobotWriteMetadata {task url out} {
    global URL
    dbgmsg 5 "RobotWriteMetadata(): task $task url $url"

    set charset $URL($task,$url,charset)
    puts $out "<?xml version=\"1.0\" encoding=\"$charset\" standalone=\"yes\"?>"
    puts $out "<zmbot>"

    set distance 1000
    if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
	set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
	RobotReadRecord $inf fromurl distance
	RobotFileClose $inf
    }
    set URL($task,$url,dist) $distance
    puts $out "<distance>"
    puts $out "  $distance"
    puts $out "</distance>"
    headSave $task $url $out
    dbgmsg 5 "RobotWriteMetadata(): Parsing $url distance=$distance"
    switch $URL($task,$url,head,content-type) {
        text/html {
            if {[string length $distance]} {
                RobotTextHtml $task $url $out
		#experimenting with tkl output
                RobotTextTkl $task $url $out
            }
        }
        text/plain {
            RobotTextPlain $task $url $out
        }
    }
    puts $out "</zmbot>"
}

proc Robot200 {task url} {
    global URL
    logmsg "$url 200 999"
    dbgmsg 5 "Robot200(): task $task url $url"
  
    # raw output (disabled for now)
    if {0} {
        set out [RobotFileOpen $task raw $URL($task,$url,hostport) $URL($task,$url,path)]
        puts -nonewline $out $URL($task,$url,buf)
        RobotFileClose $out
    }

    set out [RobotFileOpen $task visited $URL($task,$url,hostport) $URL($task,$url,path)]
    RobotWriteMetadata $task $url $out
    RobotFileClose $out

    RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
}

proc RobotReadContent {task url sock binary} {
    global URL
    dbgmsg 2 "RobotReadContent(): task $task url $url"

    set buffer ""

    if { [ catch { set buffer [read $sock 16384] } ] } {
	dbgmsg 0 "error: $url failed read socket $sock 16384"
    }
    
    set readCount [string length $buffer]

    if {$readCount <= 0} {
	Robot200 $task $url
	RobotRestart $task $url $sock
    } elseif {!$binary && [string first \0 $buffer] >= 0} {
	Robot200 $task $url
	RobotRestart $task $url $sock
    } else {
	dbgmsg 10 "RobotReadContent(): got $readCount bytes"
	set URL($task,$url,buf) $URL($task,$url,buf)$buffer
	if {[string length $URL($task,$url,buf)] > 100000} {
	    Robot200 $task $url
	    RobotRestart $task $url $sock
	}
    }
}

proc RobotReadHeader {task url sock} {
    global URL debuglevel

    if {$debuglevel > 1} {
        dbgmsg 5 "RobotReadHeader(): HTTP head $url"
    }
    if {[catch {set buffer [read $sock 2148]}]} {
	RobotError $task $url 404
	RobotRestart $task $url $sock
        return
    }
    set readCount [string length $buffer]
    
    if {$readCount <= 0} {
	RobotError $task $url 404
	RobotRestart $task $url $sock
    } else {
	dbgmsg 10 "RobotReadHeader(): got $readCount bytes"
	set URL($task,$url,buf) $URL($task,$url,buf)$buffer
	
	set n [string first \r\n\r\n $URL($task,$url,buf)]
	if {$n > 1} {
	    set code 0
	    set version {}
	    set headbuf [string range $URL($task,$url,buf) 0 $n]
	    incr n 4
	    set URL($task,$url,charset) ISO-8859-1
	    set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end]
	    
	    regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
	    set lines [split $headbuf \n]
	    foreach line $lines {
		if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
		    set URL($task,$url,head,[string tolower $name]) [string trim $value]
		}
		regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset)
	    }
	    dbgmsg 5 "RobotReadHeader(): HTTP CODE $code"
	    dbgmsg 0 "$url $code 000"
	    set URL($task,$url,state) skip
	    switch $code {
		301 {
		    if {[info exists URL($task,$url,head,location)]} {
			RobotRedirect $task $url $URL($task,$url,head,location) 302
		    } else {
			RobotError $task $url $code
		    }
		    RobotRestart $task $url $sock 
		}
		302 {
		    if {[info exists URL($task,$url,head,location)]} {
			RobotRedirect $task $url $URL($task,$url,head,location) 302
		    } else {
			RobotError $task $url $code
		    }
		    RobotRestart $task $url $sock 
		}
		200 {
		    if {![info exists URL($task,$url,head,content-type)]} {
			set URL($task,$url,head,content-type) {}
		    }
		    set binary 1
		    switch -glob -- $URL($task,$url,head,content-type) {
			text/* {
			    set binary 0
			}
		    }
                    if {![regexp {/robots.txt$} $url]} {
                        if {![checkrule $task mime $URL($task,$url,head,content-type)]} {
                            RobotError $task $url mimedeny
                            RobotRestart $task $url $sock
                            return
                        }
                    }
		    # assyncron longjump to RobotReadContent
		    fileevent $sock readable [list RobotReadContent $task $url $sock $binary]
		}
		default {
		    RobotError $task $url $code
		    RobotRestart $task $url $sock
		}
	    }
	}
    }
}

proc RobotSockCancel {task url sock} {

    dbgmsg 5 "RobotSockCancel(): sock=$sock url=$url"
    RobotError $task $url 401
    RobotRestart $task $url $sock
}

proc RobotConnect {task url sock} {
    global URL agent acceptLanguage

    fconfigure $sock -translation {lf crlf} -blocking 0
    # assyncron longjump to RobotReadHeader
    fileevent $sock readable [list RobotReadHeader $task $url $sock]
    puts $sock "GET $URL($task,$url,path) HTTP/1.0"
    puts $sock "Host: $URL($task,$url,host)"
    puts $sock "User-Agent: $agent"
    if {[string length $acceptLanguage]} {
        puts $sock "Accept-Language: $acceptLanguage"
    }
    puts $sock ""
    set URL($sock,cancel) [after 30000 [list RobotSockCancel $task $url $sock]]
    if {[catch {flush $sock}]} {
        RobotError $task $url 404
	RobotRestart $task $url $sock
    }
}

proc RobotNop {} {

}

proc RobotGetUrl {task url phost} {
    global URL robotsRunning
    flush stdout
    #logmsg "RobotGetUrl(): url=$url"
    dbgmsg 2 "RobotGetUrl(): running=$robotsRunning url=$url task=$task"    

    if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
        return -1
    }
    if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
	set port 80
	set host $hostport
    }
    set URL($task,$url,method) $method
    set URL($task,$url,host) $host
    set URL($task,$url,hostport) $hostport
    set URL($task,$url,path) $path
    set URL($task,$url,state) head
    set URL($task,$url,buf) {}

    if {[string compare $path /robots.txt]} {
	set ok 1
	if {![info exists URL($hostport,robots)]} {
	    dbgmsg 5 "RobotGetUrl(): reading robots.txt for host $hostport"
	    if {[RobotFileExist $task visited $hostport /robots.txt]} {
		set inf [RobotFileOpen $task visited $hostport /robots.txt r]
		set buf [read $inf 32768]
		close $inf
	    } else {
		set buf "User-agent: *\nAllow: /\n"
	    }
	    RobotsTxt0 $task URL($hostport,robots) $buf
	}
	if {[info exists URL($hostport,robots)]} {
	    foreach l $URL($hostport,robots) {
		if {[string first [lindex $l 1] $path] == 0} {
		    set ok [lindex $l 0]
		    break
		}
	    }
	}
	if {!$ok} {
	    dbgmsg 5 "RobotGetUrl(): skipped due to robots.txt"
	    return -1
	}
    }
    if [catch {set sock [socket -async $host $port]}] {
        return -1
    }
    RobotConnect $task $url $sock

    return 0
}

proc loadlib {} {
    global libdir

    if {![llength [info commands htmlSwitch]]} {
        if {[info exists env(tclrobot_lib)]} {
	    set d $env(tclrobot_lib)
        } else {
            if { $libdir > "" } {
                set d $libdir
            } else {
	        set d .
            }
        }
        set e [info sharedlibextension]
        dbgmsg 5 "loadlib(): Loading libtclrobot$e"
        if {[catch {load libtclrobot$e}]} {
            dbgmsg 5 "loadlib(): Trying loading at  $d directly"
	    load $d/libtclrobot$e
        }
        dbgmsg 5 "loadlib(): Loaded libtclrobot$e"
    }
}

#set agent "zmbot/0.2"
set agent "tkl-web-harvester"
if {![catch {set os [exec uname -s -r]}]} {
    set agent "$agent ($os)"
}

dbgmsg 10 "agent: $agent"
logmsg "$agent started"

proc bgerror {msg} {
    global errorInfo errorCode
    logmsg "errorCode $errorCode"
    logmsg "Detailed error information:"
    logmsg $errorInfo
    exit 1
}

# Rules: allow, deny, url

proc checkrule {task type this} {
    global control
    global debuglevel

    set default_ret 1

    if {$debuglevel > 3} {
        dbgmsg 5 "checkrule(): CHECKRULE $type $this"
    }
    if {[info exist control($task,alrules)]} {
        set myrules $control($task,alrules)
    }
    # put these rules at end.
    lappend myrules [list allow mime text/html]
    lappend myrules [list deny mime *]
    # myrules are now defined always..
    foreach l $myrules {
        if {$debuglevel > 3} {
            dbgmsg 5 "consider $l"
        }
        # consider type
        if {[lindex $l 1] != $type} continue
        # consider mask (! negates)
        set masks [lindex $l 2]
	set ok 0
	set default_ret 0
	foreach mask $masks {	
	    if {$debuglevel > 4} {
		dbgmsg 5 "checkrule(): consider single mask $mask"
	    }
	    if {[string index $mask 0] == "!"} {
		set mask [string range $mask 1 end]
		if {[string match $mask $this]}  continue
	    } else {
		if {![string match $mask $this]} continue
	    }
	    set ok 1
	}
	if {$debuglevel > 4} {
	    dbgmsg 5 "checkrule(): ok = $ok"
	}
	if {!$ok} continue
	# OK, we have a match
	if {[lindex $l 0] == "allow"} {
	    if {$debuglevel > 3} {
		dbgmsg 5 "checkrule(): CHECKRULE MATCH OK"
	    }
	    return 1
	} else {
	    if {$debuglevel > 3} {
		dbgmsg 5 "checkrule(): CHECKFULE MATCH FAIL"
	    }
	    return 0
	}
    }
    if {$debuglevel > 3} {
        dbgmsg 5 "checkrule(): CHECKRULE MATCH DEFAULT $default_ret"
    }
    return $default_ret
}


proc url {href} {
    global debuglevel task
    dbgmsg 5 "url(): entered"

    if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
        if {![RobotFileExist $task visited $host $path]} {
            set outf [RobotFileOpen $task unvisited $host $path]
            RobotWriteRecord $outf href 0
            RobotFileClose $outf
        }
    }
    dbgmsg 5 "url(): finished"
}

proc deny {type stuff} {
    global control task

    lappend control($task,alrules) [list deny $type $stuff]
}

proc allow {type stuff} {
    global control task

    lappend control($task,alrules) [list allow $type $stuff]
}

proc debug {level} {
    global debuglevel

    set debuglevel $level
}

proc CreateTask {t} {
    global tasks task status control

    set task $t
    dbgmsg 5 "CreateTask(): task file $task"
    if {[info exists tasks]} {
        if {[lsearch -exact $tasks $t] >= 0} {
            return 0
        }
    }

    lappend tasks $t
    set status($t,unvisited) 0
    set status($t,visited) 0
    set status($t,bad) 0
    set status($t,raw) 0
    set status($t,active) 1
    set control($t,seq) 0
    set control($t,distance) 10
    set control($t,target) tmp
    set control($t,output) output
    set control($t,cr) 0
    return 1
}

# Little utility that ensures that at least one task is present (main).
proc CreateMainTask {} {
    global tasks
    if {![info exist tasks]} {
        CreateTask main
    }
}

# Parse options

set i 0
set l [llength $argv]

if {$l < 1} {
    puts {tkl-web-harvester: usage:}
    puts {tkl-web-harvester [-j jobs] [-p pid] [-T tmpdir] [-o logfile] }
    puts {                  [-i idle] [-c count] [-d domain] [-D spooldir] }
    puts {                  [-r rules] [-L libdir] [-v verbosity] [url ..]}

    exit 1
}

logmsg "tkl-web-harvester: started with pid [pid]"


while  {$i < $l} {
    set arg [lindex $argv $i]
    switch -glob -- $arg {
	-o* {
	    set fname [string range $arg 2 end]
	    if {![string length $fname]} {
		set fname [lindex $argv [incr i]]
	    }
	    logmsg "tkl-web-harvester: log file $fname"
	    if { [ catch {set loghandle [open $fname a]} ] } {
		logmsg "tkl-web-harvester: opening file $fname failed. Kill"
		exit 1
	    }
	}
	-p* {
	    set pidfname [string range $arg 2 end]
	    if {![string length $pidfname]} {
		set pidfname [lindex $argv [incr i]]
	    }
	    logmsg "tkl-web-harvester: pid file $pidfname"
	    if {[file exists $pidfname]} {
		if { [ catch { set pf [open $pidfname r] } ] } {
		    logmsg "tkl-web-harvester: opening file $pidfname failed. Kill"
		    exit 1
		}
		gets $pf oldpid
		close $pf
		logmsg "tkl-web-harvester: pid file already exist with pid=$oldpid"
		if {[file isdirectory /proc/$oldpid]} {
		    logmsg "tkl-web-harvester:  apparently running. Exiting."
		    exit 1
		}
	    }
	    if { [ catch { set pf [open $pidfname w ] } ] } {
		logmsg "tkl-web-harvester: opening pid file failed. Kill"
		exit 1
	    }
	    puts $pf [pid]
	    close $pf
	}
	-T* {
	    set tmpdir [string range $arg 2 end]
	    if {![string length $tmpdir]} {
		set tmpdir [lindex $argv [incr i]]
	    }
	    logmsg "tkl-web-harvester: tmp dir $tmpdir"
	    if { [ catch { set tf [open $tmpdir/test w] } ] } {
		logmsg "tkl-web-harvester: opening tmp dir failed. Kill"
		file delete "$tmpdir/test"
		exit 1
	    }
	    close $tf
	    file delete "$tmpdir/test"
	    set control(tmpdir) $tmpdir
	}
	-L* {
	    set libdir [string range $arg 2 end]
	    if {![string length $libdir]} {
		set libdir [lindex $argv [incr i]]
	    }
	    logmsg "tkl-web-harvester: lib dir $libdir"
	}
        -t* {
	    set t [string range $arg 2 end]
	    if {![string length $t]} {
		set t [lindex $argv [incr i]]
	    }
            CreateTask $t
	}
        -D* {
	    set dir [string range $arg 2 end]
	    if {![string length $dir]} {
		set dir [lindex $argv [incr i]]
	    }
	    logmsg "tkl-web-harvester: spool dir $dir"
	    if { [ catch { set sf [open $dir/test w] } ] } {
		logmsg "tkl-web-harvester: opening spool dir failed. Kill"
		file delete "$dir/test"
		exit 1
	    }
	    close $sf
	    file delete "$dir/test"
	    set control(tmpdir) $tmpdir
            lappend daemon_dir $dir
        }
	-j* {
	    set robotsMax [string range $arg 2 end]
	    if {![string length $robotsMax]} {
		set robotsMax [lindex $argv [incr i]]
	    }
	}
	-c* {
            CreateMainTask
	    set control($task,distance) [string range $arg 2 end]
	    if {![string length $control($task,distance)]} {
		set control($task,distance) [lindex $argv [incr i]]
	    }
	}
	-d* {
            CreateMainTask
	    set dom [string range $arg 2 end]
	    if {![string length $dom]} {
		set dom [lindex $argv [incr i]]
	    }
	    lappend control($task,domains) $dom
	}
	-i* {
	    set idletime [string range $arg 2 end]
	    if {![string length $idletime]} {
		set idletime [lindex $argv [incr i]]
	    }
	    # convert from seconds to miliseconds
	    #set idletime [expr [$idletime * 1000 ]] 
	}
	-v* {
	    set debuglevel [string range $arg 2 end]
	    if {![string length $debuglevel]} {
		set debuglevel [lindex $argv [incr i]]
	    }
	}
        -l* {
            CreateMainTask
	    set acceptLanguage [string range $arg 2 end]
	    if {![string length $acceptLanguage]} {
		set acceptLanguage [lindex $argv [incr i]]
 	    }
	}
        -r* {
            CreateMainTask
	    set rfile [string range $arg 2 end]
	    if {![string length $rfile]} {
		set rfile [lindex $argv [incr i]]
 	    }
            catch {unset maxdistance}
            source $rfile
            if {[info exists maxdistance]} {
                set control($task,distance) $maxdistance
            }
        }
	default {
            CreateMainTask
	    set href $arg
	    dbgmsg 10 "in default: arg= $arg !!!"
        loadlib
	    if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
		if {![RobotFileExist $task visited $host $path]} {
		    set outf [RobotFileOpen $task unvisited $host $path]
		    RobotWriteRecord $outf href 0
		    RobotFileClose $outf
		}
	    }
	}
    }
    incr i
}


logmsg "tkl-web-harvester: pid [pid] started"
dbgmsg 5 "tkl-web-harvester: parsed args, now loading"
loadlib

if {![info exist robotsMax]} {
    set robotsMax 5
}

if {[info exist daemon_dir]} {
    logmsg "tkl-web-harvester: daemon mode running"
    RobotDaemonLoop
} else {
    foreach t $tasks {
	logmsg "tkl-web-harvester: task $t"
	logmsg "tkl-web-harvester: max distance=$control($t,distance)"
	if {[info exists control($t,domains)]} {
	    logmsg "tkl-web-harvester: domains=$control($t,domains)"
	}
    }
    logmsg "tkl-web-harvester: max jobs=$robotsMax"
    
    foreach t $tasks {
	RobotStart $t
    }
    
    while {$robotsRunning} {
	vwait robotsRunning
    }
    
    if {[info exists tasks]} {
	foreach t $tasks {
	    set statusfile [open $t/status w]
	    puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
	    close $statusfile
	}
    }
}

