#!/usr/bin/env sh
##
## This script launches the heritrix crawler.
##
## Optional environment variables
##
## JAVA_HOME        Point at a JDK install to use.
## 
## HERITRIX_HOME    Pointer to your heritrix install.  If not present, we
##                  make an educated guess based on the position of this
##                  script (the parent of the directory containing it).
##
## HERITRIX_OUT     Pathname to the Heritrix log file written when run in
##                  daemon mode.
##                  Default setting is $HERITRIX_HOME/heritrix_out.log
##
## JAVA_OPTS        Java runtime options.  Default setting is '-Xmx256m'.
##
## FOREGROUND       Set to any value -- e.g. 'true' -- if you want to run 
##                  heritrix in foreground (Used by build system when it runs
##                  selftest to see if completed successfully or not).
##
## JMX_OPTS         Default is to start up the JVM JMX administration
##                  on port 8849 if the JVM is SUN JVM 1.5.  This allows JMX
##                  administration of Heritrix.  If the JVM is other than the
##                  SUN JDK 1.5, the arguments are ignored. If you do not want
##                  to start the JVM JMX administration server on the SUN JDK
##                  1.5, set JMX_OFF (see below); an empty JMX_OPTS is treated
##                  as unset and the defaults are applied.
##
## JMX_PORT         Port you'd like the JVM JMX administration server to run
##                  on. Default is 8849.
##
## JMX_OFF          Set to a non-empty string to disable JMX (and JMX setup of
##                  password file, etc.)
##

# Resolve links - $0 may be a softlink.
#
# heritrix_resolve_script PATH
#   Follows PATH through any chain of symbolic links and prints the final
#   path on stdout.  An absolute link target replaces the path outright; a
#   relative target (with or without slashes in it) is interpreted relative
#   to the directory that holds the link, which is how the kernel resolves it.
#   Intended to be called through command substitution, so the helper
#   variables it sets do not reach the caller.
heritrix_resolve_script() {
  _hrs_path=$1
  while [ -h "$_hrs_path" ]; do
    # 'ls -ld' prints "... name -> target" for a symlink; keep the target.
    _hrs_ls=$(ls -ld "$_hrs_path")
    _hrs_target=$(expr "$_hrs_ls" : '.*-> \(.*\)$')
    case "$_hrs_target" in
      /*) _hrs_path=$_hrs_target ;;
      *)  _hrs_path=$(dirname "$_hrs_path")/$_hrs_target ;;
    esac
  done
  printf '%s\n' "$_hrs_path"
}

PRG=$(heritrix_resolve_script "$0")
PRGDIR=$(dirname "$PRG")

# Read local heritrix properties if any.  The file is sourced into this
# shell, so it can preset any of the variables consulted below.  $HOME is
# quoted so a home directory containing whitespace still works.
if [ -f "$HOME/.heritrixrc" ]; then
  . "$HOME/.heritrixrc"
fi

# Set HERITRIX_HOME.  When the caller did not supply one, use the parent of
# the directory this script lives in, as an absolute path.
#  - CDPATH is cleared for the cd so that a relative $PRGDIR cannot be looked
#    up through CDPATH (which would also make cd print to stdout and corrupt
#    the captured value).
#  - 'cd && pwd' (rather than 'cd ; pwd') so a failed cd is reported instead
#    of silently yielding the current directory.
if [ -z "$HERITRIX_HOME" ]; then
  HERITRIX_HOME=$(CDPATH= cd -- "$PRGDIR/.." && pwd) || {
    echo "Cannot determine HERITRIX_HOME. Please set HERITRIX_HOME." >&2
    exit 1
  }
fi

# Find JAVA_HOME.  When it is not set, locate 'java' on the PATH and use the
# parent of the directory holding it.  'command -v' is the POSIX lookup
# (unlike 'which', it is a shell builtin with defined output), and the path
# is quoted so directories containing whitespace survive.
if [ -z "$JAVA_HOME" ]; then
  JAVA=$(command -v java)
  if [ -z "$JAVA" ]; then
    echo "Cannot find JAVA. Please set JAVA_HOME or your PATH." >&2
    exit 1
  fi
  JAVA_BINDIR=$(dirname "$JAVA")
  JAVA_HOME=$JAVA_BINDIR/..
fi

# JAVACMD may already be defined in the environment - including flags!! -
# in which case it is left untouched.  Otherwise default to the java binary
# under JAVA_HOME.  See '[ 1482761 ] BDB Adler32 gc-lock OOME risk' for why
# the default carries 'je.disable.java.adler32'.
: "${JAVACMD:=$JAVA_HOME/bin/java -Dje.disable.java.adler32=true}"

# Ignore previous classpath.  Build one that contains the heritrix jar and
# the content of the lib directory into the variable CP.
#
# heritrix_classpath DIR
#   Prints a ':'-separated list of DIR/*.jar followed by DIR/lib/*.jar.
#   - Uses shell globs instead of parsing 'ls', so paths with whitespace are
#     kept intact; a pattern that matches nothing is skipped.
#   - Starts from an empty list, so a CP inherited from the environment is
#     really ignored, and no leading ':' (an empty entry, which the JVM
#     typically treats as the current directory) is produced.
#   Intended to be called through command substitution.
heritrix_classpath() {
  _hcp_list=
  for _hcp_jar in "$1"/*.jar "$1"/lib/*.jar; do
    [ -f "$_hcp_jar" ] || continue
    _hcp_list=${_hcp_list:+$_hcp_list:}$_hcp_jar
  done
  printf '%s\n' "$_hcp_list"
}

CP=$(heritrix_classpath "$HERITRIX_HOME")

# cygwin path translation.  When 'uname' reports a name starting with
# CYGWIN, convert the classpath and HERITRIX_HOME to Windows form with
# cygpath so the (Windows) JVM can use them.  A 'case' glob is used for the
# prefix test: the pattern is a shell glob, and the quoted substitution
# stays a single word whatever uname prints.
case "$(uname)" in
  CYGWIN*)
    CP=$(cygpath -p -w "$CP")
    HERITRIX_HOME=$(cygpath -p -w "$HERITRIX_HOME")
    ;;
esac

# Make sure of java opts: fall back to the default heap setting when
# JAVA_OPTS is unset or empty.
: "${JAVA_OPTS:= -Xmx256m}"

# heritrix_write_jmx_password_file TEMPLATE DEST PASSWORD
#   Writes DEST as a copy of TEMPLATE with the first @PASSWORD@ on each line
#   replaced by PASSWORD, readable and writable by the owner only.
#   - The password is escaped for use as a sed replacement, so '/', '&' and
#     '\' in it are inserted literally instead of breaking the command.
#   - The body runs in a subshell with umask 077 so the file is never
#     created with wider permissions, and so the helper variable holding
#     the password does not linger in the calling shell.
heritrix_write_jmx_password_file() (
  umask 077
  escaped=$(printf '%s\n' "$3" | sed -e 's|[\\/&]|\\&|g')
  rm -f "$2"
  sed -e "s/@PASSWORD@/${escaped}/" "$1" > "$2"
  chmod 600 "$2"
)

# JMX setup.  Skipped entirely when JMX_OFF is non-empty.  Otherwise pick
# the default port if none was given and, when JMX_OPTS is unset or empty,
# build the default JVM options and (re)generate the password file they
# point at.
if [ -z "${JMX_OFF}" ]; then
  if [ -z "${JMX_PORT}" ]; then
    JMX_PORT=8849
  fi

  if [ -z "$JMX_OPTS" ]; then
    JMX_OPTS="-Dcom.sun.management.jmxremote.port=${JMX_PORT} \
      -Dcom.sun.management.jmxremote.ssl=false \
      -Dcom.sun.management.jmxremote.password.file=${HERITRIX_HOME}/jmxremote.password"

    # Copy into place a jmxremote password file that uses the heritrix
    # password interpolated (first need to find the current password if one
    # was supplied on the command line via --admin=USER:PASS or -a USER:PASS,
    # else use what's in heritrix.properties as default).
    # Need to make it so it's only readable by user else jconsole won't use it.
    # printf is used rather than echo so an argument such as '-n' cannot be
    # taken as an echo option.
    JMX_PASSWORD=$(printf '%s\n' "$*" \
      | sed -n -e 's/.*--admin=[^:]*:\([^ ]*\).*/\1/p' \
          -e 's/.*-a *[^:]*:\([^ ]*\).*/\1/p')
    if [ -z "$JMX_PASSWORD" ]; then
      JMX_PASSWORD=$(sed -n \
        -e 's/heritrix.cmdline.admin[ ]*=[^:]*:\(.*\)/\1/p' \
        "${HERITRIX_HOME}/conf/heritrix.properties")
    fi
    JMX_PWORD_FILE="${HERITRIX_HOME}/jmxremote.password"
    heritrix_write_jmx_password_file \
      "${HERITRIX_HOME}/conf/jmxremote.password.template" \
      "${JMX_PWORD_FILE}" "${JMX_PASSWORD}"
  fi
fi

# Main heritrix class.  An unset or empty CLASS_MAIN falls back to the
# standard crawler entry point.
: "${CLASS_MAIN:=org.archive.crawler.Heritrix}"

# heritrix_dmesg.log contains startup output from the crawler main class.
# As soon as content appears in this log, this shell script prints the
# successful (or failed) startup content and moves off waiting on heritrix
# startup. This technique is done so we can show on the console startup
# messages emitted by java subsequent to the redirect of stdout and stderr.
startMessage="${HERITRIX_HOME}/heritrix_dmesg.log"

# Remove any file that may have been left over from previous starts.
# Otherwise the wait loop further down would pick up the stale content
# immediately and report it as this run's startup message.
if [ -f "$startMessage" ]; then
    rm -f "$startMessage"
fi
# Run heritrix as daemon.  Redirect stdout and stderr to a file.
# HERITRIX_OUT names that file; default is heritrix_out.log under
# HERITRIX_HOME.
if [ -z "$HERITRIX_OUT" ]; then
  HERITRIX_OUT="${HERITRIX_HOME}/heritrix_out.log"
fi
stdouterrlog="${HERITRIX_OUT}"

# Append a start banner with date, uname, java version, java opts and ulimit.
# The commands are grouped so the (quoted) log path is opened once; quoting
# keeps a path containing whitespace in one piece.  JAVACMD and JAVA_OPTS are
# deliberately left unquoted because each may hold several words.
{
  echo "$(date) Starting heritrix"
  uname -a
  ${JAVACMD} ${JAVA_OPTS} -version
  echo "JAVA_OPTS=${JAVA_OPTS}"
  ulimit -a
} >> "$stdouterrlog" 2>&1

# If FOREGROUND is set, run heritrix in foreground; the script's exit status
# is then that of the JVM.  Otherwise start it in the background under nohup
# with stdout/stderr appended to the log, and wait for its startup message.
#
# JAVACMD, JAVA_OPTS and JMX_OPTS are deliberately unquoted (each may hold
# several words).  Everything else is quoted, and the script's arguments are
# forwarded as "$@" so arguments containing whitespace reach Heritrix intact.
if [ -n "$FOREGROUND" ]; then
    CLASSPATH="${CP}" $JAVACMD "-Dheritrix.home=${HERITRIX_HOME}" \
        -Djava.protocol.handler.pkgs=org.archive.net \
        "-Dheritrix.out=${HERITRIX_OUT}" ${JAVA_OPTS} ${JMX_OPTS} \
        "$CLASS_MAIN" "$@"
else
    CLASSPATH="${CP}" nohup $JAVACMD "-Dheritrix.home=${HERITRIX_HOME}" \
        -Djava.protocol.handler.pkgs=org.archive.net \
        "-Dheritrix.out=${HERITRIX_OUT}" ${JAVA_OPTS} ${JMX_OPTS} \
        "$CLASS_MAIN" "$@" >> "${stdouterrlog}" 2>&1 &
    heritrix_pid=$!

    # Wait for content in the heritrix_dmesg.log file, printing a dot per
    # second.  printf is used because 'echo -n' is not portable across sh
    # implementations.
    printf '%s' "$(date) Starting heritrix"
    while :; do
        sleep 1
        # Sample liveness BEFORE looking at the file: if the process was
        # already gone here, anything it wrote is visible to the check below.
        heritrix_alive=true
        kill -0 "$heritrix_pid" 2>/dev/null || heritrix_alive=false
        if [ -s "$startMessage" ]; then
            echo
            cat "$startMessage"
            rm -f "$startMessage"
            break
        fi
        if [ "$heritrix_alive" = false ]; then
            # The JVM exited without producing a startup message; stop
            # waiting instead of looping forever.
            echo
            echo "Heritrix exited before writing ${startMessage}; see ${stdouterrlog}" >&2
            exit 1
        fi
        printf '.'
    done
fi
