1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler;
26
27 import java.io.File;
28 import java.io.FileInputStream;
29 import java.io.FileNotFoundException;
30 import java.io.FileOutputStream;
31 import java.io.IOException;
32 import java.io.InputStream;
33 import java.io.PrintStream;
34 import java.io.PrintWriter;
35 import java.net.HttpURLConnection;
36 import java.net.InetAddress;
37 import java.net.URL;
38 import java.net.URLConnection;
39 import java.net.UnknownHostException;
40 import java.util.ArrayList;
41 import java.util.Arrays;
42 import java.util.Collection;
43 import java.util.Collections;
44 import java.util.Enumeration;
45 import java.util.Hashtable;
46 import java.util.Iterator;
47 import java.util.List;
48 import java.util.Map;
49 import java.util.Properties;
50 import java.util.StringTokenizer;
51 import java.util.TimeZone;
52 import java.util.Vector;
53 import java.util.logging.Level;
54 import java.util.logging.LogManager;
55 import java.util.logging.Logger;
56
57 import javax.management.Attribute;
58 import javax.management.AttributeList;
59 import javax.management.AttributeNotFoundException;
60 import javax.management.DynamicMBean;
61 import javax.management.InstanceAlreadyExistsException;
62 import javax.management.InstanceNotFoundException;
63 import javax.management.InvalidAttributeValueException;
64 import javax.management.MBeanInfo;
65 import javax.management.MBeanNotificationInfo;
66 import javax.management.MBeanOperationInfo;
67 import javax.management.MBeanRegistration;
68 import javax.management.MBeanRegistrationException;
69 import javax.management.MBeanServer;
70 import javax.management.MBeanServerFactory;
71 import javax.management.MalformedObjectNameException;
72 import javax.management.NotCompliantMBeanException;
73 import javax.management.ObjectName;
74 import javax.management.ReflectionException;
75 import javax.management.RuntimeOperationsException;
76 import javax.management.openmbean.CompositeData;
77 import javax.management.openmbean.CompositeDataSupport;
78 import javax.management.openmbean.CompositeType;
79 import javax.management.openmbean.OpenDataException;
80 import javax.management.openmbean.OpenMBeanAttributeInfoSupport;
81 import javax.management.openmbean.OpenMBeanConstructorInfoSupport;
82 import javax.management.openmbean.OpenMBeanInfoSupport;
83 import javax.management.openmbean.OpenMBeanOperationInfoSupport;
84 import javax.management.openmbean.OpenMBeanParameterInfo;
85 import javax.management.openmbean.OpenMBeanParameterInfoSupport;
86 import javax.management.openmbean.OpenType;
87 import javax.management.openmbean.SimpleType;
88 import javax.management.openmbean.TabularData;
89 import javax.management.openmbean.TabularDataSupport;
90 import javax.management.openmbean.TabularType;
91 import javax.naming.CompoundName;
92 import javax.naming.Context;
93 import javax.naming.NameNotFoundException;
94 import javax.naming.NamingException;
95 import javax.naming.NoInitialContextException;
96
97 import org.apache.commons.cli.Option;
98 import org.archive.crawler.admin.CrawlJob;
99 import org.archive.crawler.admin.CrawlJobErrorHandler;
100 import org.archive.crawler.admin.CrawlJobHandler;
101 import org.archive.crawler.datamodel.CredentialStore;
102 import org.archive.crawler.datamodel.credential.Credential;
103 import org.archive.crawler.event.CrawlStatusListener;
104 import org.archive.crawler.framework.AlertManager;
105 import org.archive.crawler.framework.CrawlController;
106 import org.archive.crawler.framework.exceptions.FatalConfigurationException;
107 import org.archive.crawler.framework.exceptions.InitializationException;
108 import org.archive.crawler.selftest.SelfTestCrawlJobHandler;
109 import org.archive.crawler.settings.XMLSettingsHandler;
110 import org.archive.io.SinkHandler;
111 import org.archive.io.SinkHandlerLogRecord;
112 import org.archive.net.UURI;
113 import org.archive.util.FileUtils;
114 import org.archive.util.IoUtils;
115 import org.archive.util.JmxUtils;
116 import org.archive.util.JndiUtils;
117 import org.archive.util.PropertyUtils;
118 import org.archive.util.TextUtils;
119
120 import sun.net.www.protocol.file.FileURLConnection;
121
122
123 /***
124 * Main class for Heritrix crawler.
125 *
* Heritrix is usually launched by a shell script that backgrounds heritrix
* and redirects all stdout and stderr emitted by heritrix to a log file. So
128 * that startup messages emitted subsequent to the redirection of stdout and
129 * stderr show on the console, this class prints usage or startup output
130 * such as where the web UI can be found, etc., to a STARTLOG that the shell
131 * script is waiting on. As soon as the shell script sees output in this file,
132 * it prints its content and breaks out of its wait.
133 * See ${HERITRIX_HOME}/bin/heritrix.
134 *
135 * <p>Heritrix can also be embedded or launched by webapp initialization or
136 * by JMX bootstrapping. So far I count 4 methods of instantiation:
137 * <ol>
* <li>From this class's main -- the method usually used;</li>
139 * <li>From the Heritrix UI (The local-instances.jsp) page;</li>
140 * <li>A creation by a JMX agent at the behest of a remote JMX client; and</li>
141 * <li>A container such as tomcat or jboss.</li>
142 * </ol>
143 *
144 * @author gojomo
145 * @author Kristinn Sigurdsson
146 * @author Stack
147 */
148 public class Heritrix implements DynamicMBean, MBeanRegistration {
149 /***
150 * Heritrix logging instance.
151 */
152 private static final Logger logger =
153 Logger.getLogger(Heritrix.class.getName());
154
155 private static final File TMPDIR =
156 new File(System.getProperty("java.io.tmpdir", "/tmp"));
157
158 /***
159 * Name of the heritrix properties file.
160 */
161 private static final String PROPERTIES = "heritrix.properties";
162
163 /***
164 * Name of the key to use specifying alternate heritrix properties on
165 * command line.
166 */
167 private static final String PROPERTIES_KEY = PROPERTIES;
168
169 /***
170 * Prefix used on our properties we'll add to the System.properties list.
171 */
172 private static final String HERITRIX_PROPERTIES_PREFIX = "heritrix.";
173
174 /***
175 * Prefix used on other properties we'll add to the System.properties
176 * list (after stripping this prefix).
177 */
178 private static final String SYSTEM_PREFIX = "system.";
179
180 /***
181 * Instance of web server if one was started.
182 */
183 private static SimpleHttpServer httpServer = null;
184
185 /***
186 * CrawlJob handler. Manages multiple crawl jobs at runtime.
187 */
188 private CrawlJobHandler jobHandler = null;
189
190 /***
191 * Heritrix start log file.
192 *
193 * This file contains standard out produced by this main class for startup
194 * only. Used by heritrix shell script. Name here MUST match that in the
195 * <code>bin/heritrix</code> shell script. This is a DEPENDENCY the shell
196 * wrapper has on this here java heritrix.
197 */
198 private static final String STARTLOG = "heritrix_dmesg.log";
199
200 /***
201 * Default encoding.
202 *
203 * Used for content when fetching if none specified.
204 */
205 public static final String DEFAULT_ENCODING = "ISO-8859-1";
206
207 /***
208 * Heritrix stderr/stdout log file.
209 *
210 * This file should have nothing in it except messages over which we have
211 * no control (JVM stacktrace, 3rd-party lib emissions). The wrapper
212 * startup script directs stderr/stdout here. This is an INTERDEPENDENCY
213 * this program has with the wrapper shell script. Shell can actually
214 * pass us an alternate to use for this file.
215 */
216 private static String DEFAULT_HERITRIX_OUT = "heritrix_out.log";
217
218 /***
219 * Where to write this classes startup output.
220 *
221 * This out should only be used if Heritrix is being run from the
222 * command-line.
223 */
224 private static PrintWriter out = null;
225
226 /***
227 * The org.archive package
228 */
229 private static final String ARCHIVE_PACKAGE = "org.archive.";
230
231 /***
232 * The crawler package.
233 */
234 private static final String CRAWLER_PACKAGE = Heritrix.class.getName().
235 substring(0, Heritrix.class.getName().lastIndexOf('.'));
236
237 /***
238 * The root context for a webapp.
239 */
240 private static final String ROOT_CONTEXT = "/";
241
242 /***
243 * Set to true if application is started from command line.
244 */
245 private static boolean commandLine = false;
246
247 /***
248 * True if container initialization has been run.
249 */
250 private static boolean containerInitialized = false;
251
252 /***
253 * True if properties have been loaded.
254 */
255 private static boolean propertiesLoaded = false;
256
257 private static final String JAR_SUFFIX = ".jar";
258
259 private AlertManager alertManager;
260
261 /***
262 * The context of the GUI webapp. Default is root.
263 */
264 private static String adminContext = ROOT_CONTEXT;
265
266 /***
267 * True if we're to put up a GUI.
268 * Cmdline processing can override.
269 */
270 private static boolean gui =
271 !PropertyUtils.getBooleanProperty("heritrix.cmdline.nowui");
272
273 /***
274 * Port to put the GUI up on.
275 * Cmdline processing can override.
276 */
277 private static int guiPort = SimpleHttpServer.DEFAULT_PORT;
278
279
280 /***
281 * A collection containing only localhost. Used as default value
282 * for guiHosts, and passed to SimpleHttpServer when doing selftest.
283 */
284 final private static Collection<String> LOCALHOST_ONLY =
285 Collections.unmodifiableList(Arrays.asList(new String[] { "127.0.0.1" }));
286
287
288 /***
289 * Hosts to bind the GUI webserver to.
290 * By default, only contans localhost.
291 * Set to an empty collection to indicate that all available network
292 * interfaces should be used for the webserver.
293 */
294 private static Collection<String> guiHosts = LOCALHOST_ONLY;
295
296
297 /***
298 * Web UI server, realm, context name.
299 */
300 private static String ADMIN = "admin";
301
302
303 /***
304 * The MBean server we're registered with (May be null).
305 */
306 private MBeanServer mbeanServer = null;
307
308 /***
309 * MBean name we were registered as.
310 */
311 private ObjectName mbeanName = null;
312
313 /***
314 * Keep reference to all instances of Heritrix.
315 * Used by the UI to figure which of the local Heritrice it should
316 * be going against and to figure what to shutdown on the way out (If
317 * there was always a JMX Agent, we wouldn't need to keep this list. We
318 * could always ask the JMX Agent for all instances. UPDATE: True we could
319 * always ask the JMX Agent but we might keep around this local reference
320 * because it will allow faster, less awkward -- think of marshalling the args
321 * for JMX invoke operation -- access to local Heritrix instances. A new
322 * usage for this instances Map is in CrawlJob#preRegister to find the hosting
323 * Heritrix instance).
324 */
325 private static Map<String,Heritrix> instances
326 = new Hashtable<String,Heritrix>();
327
328 private OpenMBeanInfoSupport openMBeanInfo;
329 private final static String STATUS_ATTR = "Status";
330 private final static String VERSION_ATTR = "Version";
331 private final static String ISRUNNING_ATTR = "IsRunning";
332 private final static String ISCRAWLING_ATTR = "IsCrawling";
333 private final static String ALERTCOUNT_ATTR = "AlertCount";
334 private final static String NEWALERTCOUNT_ATTR = "NewAlertCount";
335 private final static String CURRENTJOB_ATTR = "CurrentJob";
336 private final static List ATTRIBUTE_LIST;
337 static {
338 ATTRIBUTE_LIST = Arrays.asList(new String [] {STATUS_ATTR,
339 VERSION_ATTR, ISRUNNING_ATTR, ISCRAWLING_ATTR,
340 ALERTCOUNT_ATTR, NEWALERTCOUNT_ATTR, CURRENTJOB_ATTR});
341 }
342
343 private final static String START_OPER = "start";
344 private final static String STOP_OPER = "stop";
345 private final static String DESTROY_OPER = "destroy";
346 private final static String INTERRUPT_OPER = "interrupt";
347 private final static String START_CRAWLING_OPER = "startCrawling";
348 private final static String STOP_CRAWLING_OPER = "stopCrawling";
349 private final static String ADD_CRAWL_JOB_OPER = "addJob";
350 private final static String TERMINATE_CRAWL_JOB_OPER =
351 "terminateCurrentJob";
352 private final static String DELETE_CRAWL_JOB_OPER = "deleteJob";
353 private final static String ALERT_OPER = "alert";
354 private final static String ADD_CRAWL_JOB_BASEDON_OPER = "addJobBasedon";
355 private final static String PENDING_JOBS_OPER = "pendingJobs";
356 private final static String COMPLETED_JOBS_OPER = "completedJobs";
357 private final static String CRAWLEND_REPORT_OPER = "crawlendReport";
358 private final static String SHUTDOWN_OPER = "shutdown";
359 private final static String LOG_OPER = "log";
360 private final static String REBIND_JNDI_OPER = "rebindJNDI";
361 private final static List OPERATION_LIST;
362 static {
363 OPERATION_LIST = Arrays.asList(new String [] {START_OPER, STOP_OPER,
364 INTERRUPT_OPER, START_CRAWLING_OPER, STOP_CRAWLING_OPER,
365 ADD_CRAWL_JOB_OPER, ADD_CRAWL_JOB_BASEDON_OPER,
366 DELETE_CRAWL_JOB_OPER, ALERT_OPER, PENDING_JOBS_OPER,
367 COMPLETED_JOBS_OPER, CRAWLEND_REPORT_OPER, SHUTDOWN_OPER,
368 LOG_OPER, DESTROY_OPER, TERMINATE_CRAWL_JOB_OPER,
369 REBIND_JNDI_OPER});
370 }
371 private CompositeType jobCompositeType = null;
372 private TabularType jobsTabularType = null;
373 private static final String [] JOB_KEYS =
374 new String [] {"uid", "name", "status"};
375
376 private static String adminUsername;
377
378 private static String adminPassword;
379
/**
 * Creates the default Heritrix instance without registering it with JMX.
 *
 * This constructor is assumed to be the one used by, for example, a JMX
 * agent creating a Heritrix instance at the command of a remote client;
 * in that case the invoking agent registers the instance.
 * Same as calling <code>Heritrix(null, false)</code>.
 *
 * @throws IOException Propagated from the delegated constructor.
 */
public Heritrix() throws IOException {
    this(null, false);
}
391
/**
 * Creates the default Heritrix instance.
 * Same as calling <code>Heritrix(null, jmxregister)</code>.
 *
 * @param jmxregister True if this instance is to be registered with the
 * JMX agent.
 * @throws IOException Propagated from the delegated constructor.
 */
public Heritrix(final boolean jmxregister) throws IOException {
    this(null, jmxregister);
}
395
/**
 * Creates a Heritrix instance backed by a new CrawlJobHandler rooted at
 * the jobs directory (see {@link #getJobsdir()}).
 *
 * @param name Instance name; if null, the default Heritrix instance is
 * brought up.
 * @param jmxregister True if this instance is to be registered with the
 * JMX agent.
 * @throws IOException If the jobs directory cannot be determined, or
 * propagated from the delegated constructor.
 */
public Heritrix(final String name, final boolean jmxregister)
throws IOException {
    this(name, jmxregister, new CrawlJobHandler(getJobsdir()));
}
407
/**
 * Creates a Heritrix instance around the passed CrawlJobHandler.
 *
 * Runs the (idempotent) container initialization, builds the MBean
 * info, wires an AlertManager that delegates to the SinkHandler
 * singleton, and finally registers the instance.
 *
 * @param name Instance name; if null, the default Heritrix instance is
 * brought up.
 * @param jmxregister True if this instance is to be registered with the
 * JMX agent.
 * @param cjh CrawlJobHandler to use.
 * @throws IOException If container initialization fails.
 * @throws NullPointerException If no SinkHandler instance is available.
 * @throws RuntimeException Wrapping any JMX exception raised while
 * registering.
 */
public Heritrix(final String name, final boolean jmxregister,
        final CrawlJobHandler cjh)
throws IOException {
    super();
    containerInitialization();
    this.jobHandler = cjh;
    this.openMBeanInfo = buildMBeanInfo();

    final SinkHandler sink = SinkHandler.getInstance();
    if (sink == null) {
        throw new NullPointerException("SinkHandler not found.");
    }

    // Every AlertManager call is forwarded to the sink handler.
    this.alertManager = new AlertManager() {
        public int getCount() {
            return sink.getCount();
        }

        public int getNewCount() {
            return sink.getUnreadCount();
        }

        public Vector getAll() {
            return sink.getAll();
        }

        public Vector getNewAll() {
            return sink.getAllUnread();
        }

        public SinkHandlerLogRecord get(String alertID) {
            return sink.get(Long.parseLong(alertID));
        }

        public void add(SinkHandlerLogRecord record) {
            sink.publish(record);
        }

        public void read(String alertID) {
            sink.read(Long.parseLong(alertID));
        }

        public void remove(String alertID) {
            sink.remove(Long.parseLong(alertID));
        }
    };

    // Checked JMX failures are rethrown unchecked with the cause kept.
    try {
        Heritrix.registerHeritrix(this, name, jmxregister);
    } catch (InstanceAlreadyExistsException alreadyExists) {
        throw new RuntimeException(alreadyExists);
    } catch (MBeanRegistrationException registrationFailed) {
        throw new RuntimeException(registrationFailed);
    } catch (NotCompliantMBeanException notCompliant) {
        throw new RuntimeException(notCompliant);
    } catch (MalformedObjectNameException badName) {
        throw new RuntimeException(badName);
    }
}
477
478 /***
479 * Run setup tasks for this 'container'. Idempotent.
480 *
481 * @throws IOException
482 */
483 protected static void containerInitialization() throws IOException {
484 if (Heritrix.containerInitialized) {
485 return;
486 }
487 Heritrix.containerInitialized = true;
488
489
490
491
492 Heritrix.loadProperties();
493 Heritrix.patchLogging();
494 Heritrix.configureTrustStore();
495
496
497
498 Runtime.getRuntime().addShutdownHook(
499 Heritrix.getShutdownThread(false, 0, "Heritrix shutdown hook"));
500
501
502 try {
503 registerContainerJndi();
504 } catch (Exception e) {
505 logger.log(Level.WARNING, "Failed jndi container registration.", e);
506 }
507 }
508
509 /***
510 * Do inverse of construction. Used by anyone who does a 'new Heritrix' when
511 * they want to cleanup the instance.
512 * Of note, there may be Heritrix threads still hanging around after the
513 * call to destroy completes. They'll eventually go down after they've
514 * finished their cleanup routines. In particular, if you are watching
515 * Heritrix via JMX, you can see the Heritrix instance JMX bean unregister
516 * ahead of the CrawlJob JMX bean that its hosting.
517 */
518 public void destroy() {
519 stop();
520 try {
521 Heritrix.unregisterHeritrix(this);
522 } catch (InstanceNotFoundException e) {
523 e.printStackTrace();
524 } catch (MBeanRegistrationException e) {
525 e.printStackTrace();
526 } catch (NullPointerException e) {
527 e.printStackTrace();
528 }
529 this.jobHandler = null;
530 this.openMBeanInfo = null;
531 }
532
533 /***
534 * Launch program.
535 * Optionally will launch a web server to host UI. Will also register
536 * Heritrix MBean with first found JMX Agent (Usually the 1.5.0 JVM
537 * Agent).
538 *
539 * @param args Command line arguments.
540 * @throws Exception
541 */
542 public static void main(String[] args)
543 throws Exception {
544 Heritrix.commandLine = true;
545
546
547
548 TimeZone.setDefault(TimeZone.getTimeZone("GMT"));
549
550 File startLog = new File(getHeritrixHome(), STARTLOG);
551 Heritrix.out = new PrintWriter(isDevelopment()?
552 System.out: new PrintStream(new FileOutputStream(startLog)));
553
554 try {
555 containerInitialization();
556 String status = doCmdLineArgs(args);
557 if (status != null) {
558 Heritrix.out.println(status);
559 }
560 }
561
562 catch(Exception e) {
563
564 e.printStackTrace(Heritrix.out);
565 throw e;
566 }
567
568 finally {
569
570
571
572 if (!isDevelopment()) {
573 if (Heritrix.out != null) {
574 Heritrix.out.close();
575 }
576 System.out.println("Heritrix version: " +
577 Heritrix.getVersion());
578 } else {
579 if (Heritrix.out != null) {
580 Heritrix.out.flush();
581 }
582 }
583 }
584 }
585
586 protected static String doCmdLineArgs(final String [] args)
587 throws Exception {
588
589 String tmpStr = PropertyUtils.
590 getPropertyOrNull("heritrix.context");
591 if (tmpStr != null) {
592 Heritrix.adminContext = tmpStr;
593 }
594 tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.port");
595 if (tmpStr != null) {
596 Heritrix.guiPort = Integer.parseInt(tmpStr);
597 }
598 tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.admin");
599 String adminLoginPassword = (tmpStr == null)? "": tmpStr;
600 String crawlOrderFile =
601 PropertyUtils.getPropertyOrNull("heritrix.cmdline.order");
602 tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.run");
603 boolean runMode =
604 PropertyUtils.getBooleanProperty("heritrix.cmdline.run");
605 boolean selfTest = false;
606 String selfTestName = null;
607 CommandLineParser clp = new CommandLineParser(args, Heritrix.out,
608 Heritrix.getVersion());
609 List arguments = clp.getCommandLineArguments();
610 Option [] options = clp.getCommandLineOptions();
611
612
613
614 if (arguments.size() > 1) {
615 clp.usage(1);
616 } else if (arguments.size() == 1) {
617 crawlOrderFile = (String)arguments.get(0);
618 if (!(new File(crawlOrderFile).exists())) {
619 clp.usage("ORDER.XML <" + crawlOrderFile +
620 "> specified does not exist.", 1);
621 }
622
623 if (crawlOrderFile.length() > 4 &&
624 !crawlOrderFile.substring(crawlOrderFile.length() - 4).
625 equalsIgnoreCase(".xml")) {
626 clp.usage("ORDER.XML <" + crawlOrderFile +
627 "> does not have required '.xml' suffix.", 1);
628 }
629 }
630
631
632 for (int i = 0; i < options.length; i++) {
633 switch(options[i].getId()) {
634 case 'h':
635 clp.usage();
636 break;
637
638 case 'a':
639 adminLoginPassword = options[i].getValue();
640 break;
641
642 case 'n':
643 if (crawlOrderFile == null) {
644 clp.usage("You must specify an ORDER_FILE with" +
645 " '--nowui' option.", 1);
646 }
647 Heritrix.gui = false;
648 break;
649
650 case 'b':
651 Heritrix.guiHosts = parseHosts(options[i].getValue());
652 break;
653
654 case 'p':
655 try {
656 Heritrix.guiPort =
657 Integer.parseInt(options[i].getValue());
658 } catch (NumberFormatException e) {
659 clp.usage("Failed parse of port number: " +
660 options[i].getValue(), 1);
661 }
662 if (Heritrix.guiPort <= 0) {
663 clp.usage("Nonsensical port number: " +
664 options[i].getValue(), 1);
665 }
666 break;
667
668 case 'r':
669 runMode = true;
670 break;
671
672 case 's':
673 selfTestName = options[i].getValue();
674 selfTest = true;
675 break;
676
677 default:
678 assert false: options[i].getId();
679 }
680 }
681
682
683 String status = null;
684 if (selfTest) {
685
686
687
688 for (int i = 0; i < options.length; i++) {
689 if (options[i].getId() != 'p' && options[i].getId() != 's') {
690 clp.usage(1);
691 }
692 }
693
694 if (arguments.size() > 0) {
695
696 clp.usage(1);
697 }
698 status = selftest(selfTestName, Heritrix.guiPort);
699 } else {
700 if (!isValidLoginPasswordString(adminLoginPassword)) {
701 clp.usage("Invalid admin login:password value, or none "
702 + "specified. ", 1);
703 }
704
705 if (!Heritrix.gui) {
706 if (options.length > 1) {
707
708
709
710 clp.usage(1);
711 }
712 Heritrix h = new Heritrix(true);
713 status = h.doOneCrawl(crawlOrderFile);
714 } else {
715 status = startEmbeddedWebserver(
716 Heritrix.guiHosts, Heritrix.guiPort,
717 adminLoginPassword);
718 Heritrix h = new Heritrix(true);
719
720 String tmp = h.launch(crawlOrderFile, runMode);
721 if (tmp != null) {
722 status += ('\n' + tmp);
723 }
724 }
725 }
726 return status;
727 }
728
729 /***
730 * @return The file we dump stdout and stderr into.
731 */
732 public static String getHeritrixOut() {
733 String tmp = System.getProperty("heritrix.out");
734 if (tmp == null || tmp.length() == 0) {
735 tmp = Heritrix.DEFAULT_HERITRIX_OUT;
736 }
737 return tmp;
738 }
739
740 /***
741 * Exploit <code>-Dheritrix.home</code> if available to us.
742 * Is current working dir if no heritrix.home property supplied.
743 * @return Heritrix home directory.
744 * @throws IOException
745 */
746 protected static File getHeritrixHome()
747 throws IOException {
748 File heritrixHome = null;
749 String home = System.getProperty("heritrix.home");
750 if (home != null && home.length() > 0) {
751 heritrixHome = new File(home);
752 if (!heritrixHome.exists()) {
753 throw new IOException("HERITRIX_HOME <" + home +
754 "> does not exist.");
755 }
756 } else {
757 heritrixHome = new File(new File("").getAbsolutePath());
758 }
759 return heritrixHome;
760 }
761
762 /***
763 * @return The directory into which we put jobs. If the system property
764 * 'heritrix.jobsdir' is set, we will use its value in place of the default
765 * 'jobs' directory in the current working directory.
766 * @throws IOException
767 */
768 public static File getJobsdir() throws IOException {
769 Heritrix.loadProperties();
770 String jobsdirStr = System.getProperty("heritrix.jobsdir", "jobs");
771 File jobsdir = new File(jobsdirStr);
772 return (jobsdir.isAbsolute())?
773 jobsdir:
774 new File(getHeritrixHome(), jobsdirStr);
775 }
776
777 /***
778 * Get and check for existence of expected subdir.
779 *
780 * If development flag set, then look for dir under src dir.
781 *
782 * @param subdirName Dir to look for.
783 * @return The extant subdir. Otherwise null if we're running
784 * in a webapp context where there is no conf directory available.
785 * @throws IOException if unable to find expected subdir.
786 */
787 protected static File getSubDir(String subdirName)
788 throws IOException {
789 return getSubDir(subdirName, true);
790 }
791
792 /***
793 * Get and optionally check for existence of subdir.
794 *
795 * If development flag set, then look for dir under src dir.
796 *
797 * @param subdirName Dir to look for.
798 * @param fail True if we are to fail if directory does not
799 * exist; false if we are to return false if the directory does not exist.
800 * @return The extant subdir. Otherwise null if we're running
801 * in a webapp context where there is no subdir directory available.
802 * @throws IOException if unable to find expected subdir.
803 */
804 protected static File getSubDir(String subdirName, boolean fail)
805 throws IOException {
806 String path = isDevelopment()?
807 "src" + File.separator + subdirName:
808 subdirName;
809 File dir = new File(getHeritrixHome(), path);
810 if (!dir.exists()) {
811 if (fail) {
812 throw new IOException("Cannot find subdir: " + subdirName);
813 }
814 dir = null;
815 }
816 return dir;
817 }
818
819 /***
820 * Test string is valid login/password string.
821 *
822 * A valid login/password string has the login and password compounded
823 * w/ a ':' delimiter.
824 *
825 * @param str String to test.
826 * @return True if valid password/login string.
827 */
828 protected static boolean isValidLoginPasswordString(String str) {
829 boolean isValid = false;
830 StringTokenizer tokenizer = new StringTokenizer(str, ":");
831 if (tokenizer.countTokens() == 2) {
832 String login = ((String)tokenizer.nextElement()).trim();
833 String password = ((String)tokenizer.nextElement()).trim();
834 if (login.length() > 0 && password.length() > 0) {
835 isValid = true;
836 }
837 }
838 return isValid;
839 }
840
841 protected static boolean isDevelopment() {
842 return System.getProperty("heritrix.development") != null;
843 }
844
845 /***
846 * Load the heritrix.properties file.
847 *
848 * Adds any property that starts with
849 * <code>HERITRIX_PROPERTIES_PREFIX</code>
850 * or <code>ARCHIVE_PACKAGE</code>
851 * into system properties (except logging '.level' directives).
852 * @return Loaded properties.
853 * @throws IOException
854 */
855 protected static Properties loadProperties()
856 throws IOException {
857 if (Heritrix.propertiesLoaded) {
858 return System.getProperties();
859 }
860 Heritrix.propertiesLoaded = true;
861
862 Properties properties = new Properties();
863 properties.load(getPropertiesInputStream());
864
865
866
867
868
869 for (Enumeration e = properties.keys(); e.hasMoreElements();) {
870 String key = ((String)e.nextElement()).trim();
871 if (key.startsWith(ARCHIVE_PACKAGE) ||
872 key.startsWith(HERITRIX_PROPERTIES_PREFIX)) {
873
874
875 String value = properties.getProperty(key).trim();
876 if (key.indexOf(".level") < 0) {
877 copyToSystemProperty(key, value);
878 }
879 } else if (key.startsWith(SYSTEM_PREFIX)) {
880 String value = properties.getProperty(key).trim();
881 copyToSystemProperty(key.substring(SYSTEM_PREFIX.length()), value);
882 }
883 }
884 return properties;
885 }
886
887 /***
888 * Copy the given key-value into System properties, as long as there
889 * is no existing value.
890 * @param key property key
891 * @param value property value
892 */
893 protected static void copyToSystemProperty(String key, String value) {
894 if (System.getProperty(key) == null ||
895 System.getProperty(key).length() == 0) {
896 System.setProperty(key, value);
897 }
898 }
899
900 protected static InputStream getPropertiesInputStream()
901 throws IOException {
902 File file = null;
903
904 String alternateProperties = System.getProperty(PROPERTIES_KEY);
905 if (alternateProperties != null && alternateProperties.length() > 0) {
906 file = new File(alternateProperties);
907 }
908
909 if ((file == null || !file.exists()) && getConfdir(false) != null) {
910 file = new File(getConfdir(), PROPERTIES);
911 if (!file.exists()) {
912
913
914 file = null;
915 }
916 }
917
918
919
920 InputStream is = (file != null)?
921 new FileInputStream(file):
922 Heritrix.class.getResourceAsStream("/" + PROPERTIES_KEY);
923 if (is == null) {
924 throw new IOException("Failed to load properties file from" +
925 " filesystem or from classpath.");
926 }
927 return is;
928 }
929
930 /***
931 * If the user hasn't altered the default logging parameters, tighten them
932 * up somewhat: some of our libraries are way too verbose at the INFO or
933 * WARNING levels.
934 *
935 * This might be a problem running inside in someone else's
936 * container. Container's seem to prefer commons logging so we
937 * ain't messing them doing the below.
938 *
939 * @throws IOException
940 * @throws SecurityException
941 */
942 protected static void patchLogging()
943 throws SecurityException, IOException {
944 if (System.getProperty("java.util.logging.config.class") != null) {
945 return;
946 }
947
948 if (System.getProperty("java.util.logging.config.file") != null) {
949 return;
950 }
951
952
953
954 LogManager.getLogManager().
955 readConfiguration(getPropertiesInputStream());
956 }
957
958 /***
959 * Configure our trust store.
960 *
961 * If system property is defined, then use it for our truststore. Otherwise
962 * use the heritrix truststore under conf directory if it exists.
963 *
964 * <p>If we're not launched from the command-line, we will not be able
965 * to find our truststore. The truststore is nor normally used so rare
966 * should this be a problem (In case where we don't use find our trust
967 * store, we'll use the 'default' -- either the JVMs or the containers).
968 */
969 protected static void configureTrustStore() {
970
971 final String TRUSTSTORE_KEY = "javax.net.ssl.trustStore";
972 String value = System.getProperty(TRUSTSTORE_KEY);
973 File confdir = null;
974 try {
975 confdir = getConfdir(false);
976 } catch (IOException e) {
977 logger.log(Level.WARNING, "Failed to get confdir.", e);
978 }
979 if ((value == null || value.length() <= 0) && confdir != null) {
980
981 File heritrixStore = new File(confdir, "heritrix.cacerts");
982 if(heritrixStore.exists()) {
983 value = heritrixStore.getAbsolutePath();
984 }
985 }
986
987 if (value != null && value.length() > 0) {
988 System.setProperty(TRUSTSTORE_KEY, value);
989 }
990 }
991
992 /***
993 * Run the selftest
994 *
995 * @param oneSelfTestName Name of a test if we are to run one only rather
996 * than the default running all tests.
997 * @param port Port number to use for web UI.
998 *
999 * @exception Exception
1000 * @return Status of how selftest startup went.
1001 */
1002 protected static String selftest(final String oneSelfTestName,
1003 final int port)
1004 throws Exception {
1005
1006 final String SELFTEST = "selftest";
1007 Heritrix.httpServer = new SimpleHttpServer(SELFTEST,
1008 Heritrix.adminContext, LOCALHOST_ONLY, port, true);
1009
1010
1011
1012
1013
1014 Heritrix.httpServer.setAuthentication(SELFTEST, Heritrix.adminContext,
1015 SELFTEST, SELFTEST, SELFTEST);
1016 Heritrix.httpServer.startServer();
1017
1018
1019 File selftestDir = (isDevelopment())?
1020 new File(getConfdir(), SELFTEST):
1021 new File(File.separator + SELFTEST);
1022 File crawlOrderFile = new File(selftestDir, "order.xml");
1023
1024
1025
1026
1027 final String ROOTURI = "127.0.0.1:" + Integer.toString(port);
1028 String selfTestUrl = "http://" + ROOTURI + '/';
1029 if (oneSelfTestName != null && oneSelfTestName.length() > 0) {
1030 selfTestUrl += (oneSelfTestName + '/');
1031 }
1032 CrawlJobHandler cjh = new SelfTestCrawlJobHandler(getJobsdir(),
1033 oneSelfTestName, selfTestUrl);
1034 Heritrix h = new Heritrix("Selftest", true, cjh);
1035 CrawlJob job = createCrawlJob(cjh, crawlOrderFile, "Template");
1036 job = h.getJobHandler().newJob(job, null, SELFTEST,
1037 "Integration self test", selfTestUrl, CrawlJob.PRIORITY_AVERAGE);
1038 h.getJobHandler().addJob(job);
1039
1040 CredentialStore cs = (CredentialStore)job.getSettingsHandler().
1041 getOrder().getAttribute(CredentialStore.ATTR_NAME);
1042 for (Iterator i = cs.iterator(null); i.hasNext();) {
1043 ((Credential)i.next()).setCredentialDomain(null, ROOTURI);
1044 }
1045 h.getJobHandler().startCrawler();
1046 StringBuffer buffer = new StringBuffer();
1047 buffer.append("Heritrix " + Heritrix.getVersion() +
1048 " selftest started.");
1049 buffer.append("\nSelftest first crawls " + selfTestUrl +
1050 " and then runs an analysis.");
1051 buffer.append("\nResult of analysis printed to " +
1052 getHeritrixOut() + " when done.");
1053 buffer.append("\nSelftest job directory for logs and arcs:\n" +
1054 job.getDirectory().getAbsolutePath());
1055 return buffer.toString();
1056 }
1057
1058 /***
1059 * Launch the crawler without a web UI and run the passed crawl only.
1060 *
1061 * Specialized version of {@link #launch()}.
1062 *
1063 * @param crawlOrderFile The crawl order to crawl.
1064 * @throws InitializationException
1065 * @throws InvalidAttributeValueException
1066 * @return Status string.
1067 */
1068 protected String doOneCrawl(String crawlOrderFile)
1069 throws InitializationException, InvalidAttributeValueException {
1070 return doOneCrawl(crawlOrderFile, null);
1071 }
1072
1073 /***
1074 * Launch the crawler without a web UI and run passed crawl only.
1075 *
1076 * Specialized version of {@link #launch()}.
1077 *
1078 * @param crawlOrderFile The crawl order to crawl.
1079 * @param listener Register this crawl status listener before starting
1080 * crawl (You can use this listener to notice end-of-crawl).
1081 * @throws InitializationException
1082 * @throws InvalidAttributeValueException
1083 * @return Status string.
1084 */
1085 protected String doOneCrawl(String crawlOrderFile,
1086 CrawlStatusListener listener)
1087 throws InitializationException, InvalidAttributeValueException {
1088 XMLSettingsHandler handler =
1089 new XMLSettingsHandler(new File(crawlOrderFile));
1090 handler.initialize();
1091 CrawlController controller = new CrawlController();
1092 controller.initialize(handler);
1093 if (listener != null) {
1094 controller.addCrawlStatusListener(listener);
1095 }
1096 controller.requestCrawlStart();
1097 return "Crawl started using " + crawlOrderFile + ".";
1098 }
1099
1100 /***
1101 * Launch the crawler for a web UI.
1102 *
1103 * Crawler hangs around waiting on jobs.
1104 *
1105 * @exception Exception
1106 * @return A status string describing how the launch went.
1107 * @throws Exception
1108 */
1109 public String launch() throws Exception {
1110 return launch(null, false);
1111 }
1112
1113 /***
1114 * Launch the crawler for a web UI.
1115 *
1116 * Crawler hangs around waiting on jobs.
1117 *
1118 * @param crawlOrderFile File to crawl. May be null.
1119 * @param runMode Whether crawler should be set to run mode.
1120 *
1121 * @exception Exception
1122 * @return A status string describing how the launch went.
1123 */
1124 public String launch(String crawlOrderFile, boolean runMode)
1125 throws Exception {
1126 String status = null;
1127 if (crawlOrderFile != null) {
1128 addCrawlJob(crawlOrderFile, "Autolaunched", "", "");
1129 if(runMode) {
1130 this.jobHandler.startCrawler();
1131 status = "Job being crawled: " + crawlOrderFile;
1132 } else {
1133 status = "Crawl job ready and pending: " + crawlOrderFile;
1134 }
1135 } else if(runMode) {
1136
1137
1138
1139 this.jobHandler.startCrawler();
1140 status = "Crawler set to run mode.";
1141 }
1142 return status;
1143 }
1144
1145 /***
1146 * Start up the embedded Jetty webserver instance.
1147 * This is done when we're run from the command-line.
1148 * @param port Port number to use for web UI.
1149 * @param adminLoginPassword Compound of login and password.
1150 * @throws Exception
1151 * @return Status on webserver startup.
1152 * @deprecated Use startEmbeddedWebserver(hosts, port, adminLoginPassword)
1153 */
1154 protected static String startEmbeddedWebserver(final int port,
1155 final boolean lho, final String adminLoginPassword)
1156 throws Exception {
1157 ArrayList<String> hosts = new ArrayList<String>();
1158 if (lho) {
1159 hosts.add("127.0.0.1");
1160 }
1161 return startEmbeddedWebserver(hosts, port, adminLoginPassword);
1162 }
1163
1164
1165 /***
1166 * Parses a list of host names.
1167 *
1168 * <p>If the given string is <code>/</code>, then an empty
1169 * collection is returned. This indicates that all available network
1170 * interfaces should be used.
1171 *
1172 * <p>Otherwise, the string must contain a comma-separated list of
1173 * IP addresses or host names. The parsed list is then returned.
1174 *
1175 * @param hosts the string to parse
1176 * @return the parsed collection of hosts
1177 */
1178 private static Collection<String> parseHosts(String hosts) {
1179 hosts = hosts.trim();
1180 if (hosts.equals("/")) {
1181 return new ArrayList<String>(1);
1182 }
1183 String[] hostArray = hosts.split(",");
1184 for (int i = 0; i < hostArray.length; i++) {
1185 hostArray[i] = hostArray[i].trim();
1186 }
1187 return Arrays.asList(hostArray);
1188 }
1189
1190 /***
1191 * Start up the embedded Jetty webserver instance.
1192 * This is done when we're run from the command-line.
1193 *
1194 * @param hosts a list of IP addresses or hostnames to bind to, or an
1195 * empty collection to bind to all available network
1196 * interfaces
1197 * @param port Port number to use for web UI.
1198 * @param adminLoginPassword Compound of login and password.
1199 * @throws Exception
1200 * @return Status on webserver startup.
1201 */
1202 protected static String startEmbeddedWebserver(Collection<String> hosts,
1203 int port, String adminLoginPassword)
1204 throws Exception {
1205 adminUsername = adminLoginPassword.
1206 substring(0, adminLoginPassword.indexOf(":"));
1207 adminPassword = adminLoginPassword.
1208 substring(adminLoginPassword.indexOf(":") + 1);
1209 Heritrix.httpServer = new SimpleHttpServer("admin",
1210 Heritrix.adminContext, hosts, port, false);
1211
1212 final String DOTWAR = ".war";
1213 final String SELFTEST = "selftest";
1214
1215
1216 File[] wars = getWarsdir().listFiles();
1217 for(int i = 0; i < wars.length; i++) {
1218 if(wars[i].isFile()) {
1219 final String warName = wars[i].getName();
1220 final String warNameNC = warName.toLowerCase();
1221 if(warNameNC.endsWith(DOTWAR) &&
1222 !warNameNC.equals(ADMIN + DOTWAR) &&
1223 !warNameNC.equals(SELFTEST + DOTWAR)) {
1224 int dot = warName.indexOf('.');
1225 Heritrix.httpServer.addWebapp(warName.substring(0, dot),
1226 null, true);
1227 }
1228 }
1229 }
1230
1231
1232
1233 final String ROLE = ADMIN;
1234 Heritrix.httpServer.setAuthentication(ROLE, Heritrix.adminContext,
1235 adminUsername, adminPassword, ROLE);
1236 Heritrix.httpServer.startServer();
1237 StringBuffer buffer = new StringBuffer();
1238 buffer.append("Heritrix " + Heritrix.getVersion() + " is running.");
1239 for (String host: httpServer.getHosts()) {
1240 buffer.append("\nWeb console is at: http://");
1241 buffer.append(host).append(':').append(port);
1242 }
1243 buffer.append("\nWeb console login and password: " +
1244 adminUsername + "/" + adminPassword);
1245 return buffer.toString();
1246 }
1247
1248 /***
1249 * Replace existing administrator login info with new info.
1250 *
1251 * @param newUsername new administrator login username
1252 * @param newPassword new administrator login password
1253 */
1254 public static void resetAuthentication(String newUsername,
1255 String newPassword) {
1256 Heritrix.httpServer.resetAuthentication(ADMIN, adminUsername,
1257 newUsername, newPassword);
1258 adminUsername = newUsername;
1259 adminPassword = newPassword;
1260 logger.info("administrative login changed to "
1261 +newUsername+":"+newPassword);
1262 }
1263
1264 protected static CrawlJob createCrawlJob(CrawlJobHandler handler,
1265 File crawlOrderFile, String name)
1266 throws InvalidAttributeValueException {
1267 XMLSettingsHandler settings = new XMLSettingsHandler(crawlOrderFile);
1268 settings.initialize();
1269 return new CrawlJob(handler.getNextJobUID(), name, settings,
1270 new CrawlJobErrorHandler(Level.SEVERE),
1271 CrawlJob.PRIORITY_HIGH,
1272 crawlOrderFile.getAbsoluteFile().getParentFile());
1273 }
1274
1275 /***
1276 * This method is called when we have an order file to hand that we want
1277 * to base a job on. It leaves the order file in place and just starts up
1278 * a job that uses all the order points to for locations for logs, etc.
1279 * @param orderPathOrUrl Path to an order file or to a seeds file.
1280 * @param name Name to use for this job.
1281 * @param description
1282 * @param seeds
1283 * @return A status string.
1284 * @throws IOException
1285 * @throws FatalConfigurationException
1286 */
1287 public String addCrawlJob(String orderPathOrUrl, String name,
1288 String description, String seeds)
1289 throws IOException, FatalConfigurationException {
1290 if (!UURI.hasScheme(orderPathOrUrl)) {
1291
1292 return addCrawlJob(new File(orderPathOrUrl), name, description,
1293 seeds);
1294 }
1295
1296
1297 URL url = new URL(orderPathOrUrl);
1298
1299
1300
1301
1302 String result = null;
1303 URLConnection connection = url.openConnection();
1304 if (connection instanceof HttpURLConnection) {
1305 result = addCrawlJob(url, (HttpURLConnection)connection, name,
1306 description, seeds);
1307 } else if (connection instanceof FileURLConnection) {
1308 result = addCrawlJob(new File(url.getPath()), name, description,
1309 seeds);
1310 } else {
1311 throw new UnsupportedOperationException("No support for "
1312 + connection);
1313 }
1314
1315 return result;
1316 }
1317
1318 protected String addCrawlJob(final URL url,
1319 final HttpURLConnection connection,
1320 final String name, final String description, final String seeds)
1321 throws IOException, FatalConfigurationException {
1322
1323 boolean isJar = url.getPath() != null &&
1324 url.getPath().toLowerCase().endsWith(JAR_SUFFIX);
1325
1326 File localFile = File.createTempFile(Heritrix.class.getName(),
1327 isJar? JAR_SUFFIX: null, TMPDIR);
1328 connection.connect();
1329 String result = null;
1330 try {
1331 IoUtils.readFullyToFile(connection.getInputStream(), localFile);
1332 result = addCrawlJob(localFile, name, description, seeds);
1333 } catch (IOException ioe) {
1334
1335 localFile.delete();
1336 localFile = null;
1337 } finally {
1338 connection.disconnect();
1339
1340
1341
1342 if (isJar && localFile != null && localFile.exists()) {
1343 localFile.delete();
1344 }
1345 }
1346 return result;
1347 }
1348
1349 protected String addCrawlJob(final File order, final String name,
1350 final String description, final String seeds)
1351 throws FatalConfigurationException, IOException {
1352 CrawlJob addedJob = null;
1353 if (this.jobHandler == null) {
1354 throw new NullPointerException("Heritrix jobhandler is null.");
1355 }
1356 try {
1357 if (order.getName().toLowerCase().endsWith(JAR_SUFFIX)) {
1358 return addCrawlJobBasedonJar(order, name, description, seeds);
1359 }
1360 addedJob = this.jobHandler.
1361 addJob(createCrawlJob(this.jobHandler, order, name));
1362 } catch (InvalidAttributeValueException e) {
1363 FatalConfigurationException fce = new FatalConfigurationException(
1364 "Converted InvalidAttributeValueException on " +
1365 order.getAbsolutePath() + ": " + e.getMessage());
1366 fce.setStackTrace(e.getStackTrace());
1367 }
1368 return addedJob != null? addedJob.getUID(): null;
1369 }
1370
1371 /***
1372 * Undo jar file and use as basis for a new job.
1373 * @param jarFile Pointer to file that holds jar.
1374 * @param name Name to use for new job.
1375 * @param description
1376 * @param seeds
1377 * @return Message.
1378 * @throws IOException
1379 * @throws FatalConfigurationException
1380 */
1381 protected String addCrawlJobBasedonJar(final File jarFile,
1382 final String name, final String description, final String seeds)
1383 throws IOException, FatalConfigurationException {
1384 if (jarFile == null || !jarFile.exists()) {
1385 throw new FileNotFoundException(jarFile.getAbsolutePath());
1386 }
1387
1388
1389
1390
1391 File dir = File.createTempFile(Heritrix.class.getName(), ".expandedjar",
1392 TMPDIR);
1393 dir.delete();
1394 dir.mkdir();
1395 try {
1396 org.archive.crawler.util.IoUtils.unzip(jarFile, dir);
1397
1398 File orderFile = new File(dir, "order.xml");
1399 if (!orderFile.exists()) {
1400 throw new IOException("Missing order: " +
1401 orderFile.getAbsolutePath());
1402 }
1403 CrawlJob job =
1404 createCrawlJobBasedOn(orderFile, name, description, seeds);
1405
1406
1407 File seedsFile = new File(dir, "seeds.txt");
1408 if (seedsFile.exists()) {
1409 FileUtils.copyFiles(seedsFile, new File(job.getDirectory(),
1410 seedsFile.getName()));
1411 }
1412 File settingsDir = new File(dir, "settings");
1413 if (settingsDir.exists()) {
1414 FileUtils.copyFiles(settingsDir, job.getDirectory());
1415 }
1416 addCrawlJob(job);
1417 return job.getUID();
1418 } finally {
1419
1420
1421
1422
1423
1424 org.archive.util.FileUtils.deleteDir(dir);
1425 }
1426 }
1427
1428 public String addCrawlJobBasedOn(String jobUidOrProfile,
1429 String name, String description, String seeds) {
1430 try {
1431 CrawlJob cj = getJobHandler().getJob(jobUidOrProfile);
1432 if (cj == null) {
1433 throw new InvalidAttributeValueException(jobUidOrProfile +
1434 " is not a job UID or profile name (Job UIDs are " +
1435 " usually the 14 digit date portion of job name).");
1436 }
1437 CrawlJob job = addCrawlJobBasedOn(
1438 cj.getSettingsHandler().getOrderFile(), name, description,
1439 seeds);
1440 return job.getUID();
1441 } catch (Exception e) {
1442 e.printStackTrace();
1443 return "Exception on " + jobUidOrProfile + ": " + e.getMessage();
1444 }
1445 }
1446
1447 protected CrawlJob addCrawlJobBasedOn(final File orderFile,
1448 final String name, final String description, final String seeds)
1449 throws FatalConfigurationException {
1450 return addCrawlJob(createCrawlJobBasedOn(orderFile, name, description,
1451 seeds));
1452 }
1453
1454 protected CrawlJob createCrawlJobBasedOn(final File orderFile,
1455 final String name, final String description, final String seeds)
1456 throws FatalConfigurationException {
1457 CrawlJob job = getJobHandler().newJob(orderFile, name, description,
1458 seeds);
1459 return CrawlJobHandler.ensureNewJobWritten(job, name, description);
1460 }
1461
1462 protected CrawlJob addCrawlJob(final CrawlJob job) {
1463 return getJobHandler().addJob(job);
1464 }
1465
1466 public void startCrawling() {
1467 if (getJobHandler() == null) {
1468 throw new NullPointerException("Heritrix jobhandler is null.");
1469 }
1470 getJobHandler().startCrawler();
1471 }
1472
1473 public void stopCrawling() {
1474 if (getJobHandler() == null) {
1475 throw new NullPointerException("Heritrix jobhandler is null.");
1476 }
1477 getJobHandler().stopCrawler();
1478 }
1479
1480 /***
1481 * Get the heritrix version.
1482 *
1483 * @return The heritrix version. May be null.
1484 */
1485 public static String getVersion() {
1486 return System.getProperty("heritrix.version");
1487 }
1488
1489 /***
1490 * Get the job handler
1491 *
1492 * @return The CrawlJobHandler being used.
1493 */
1494 public CrawlJobHandler getJobHandler() {
1495 return this.jobHandler;
1496 }
1497
1498 /***
1499 * Get the configuration directory.
1500 * @return The conf directory under HERITRIX_HOME or null if none can
1501 * be found.
1502 * @throws IOException
1503 */
1504 public static File getConfdir()
1505 throws IOException {
1506 return getConfdir(true);
1507 }
1508
1509 /***
1510 * Get the configuration directory.
1511 * @param fail Throw IOE if can't find directory if true, else just
1512 * return null.
1513 * @return The conf directory under HERITRIX_HOME or null (or an IOE) if
1514 * can't be found.
1515 * @throws IOException
1516 */
1517 public static File getConfdir(final boolean fail)
1518 throws IOException {
1519 final String key = "heritrix.conf";
1520
1521 String tmp = System.getProperty(key);
1522
1523 if (tmp == null || tmp.length() == 0) {
1524 return getSubDir("conf", fail);
1525 }
1526 File dir = new File(tmp);
1527 if (!dir.exists()) {
1528 if (fail) {
1529 throw new IOException("Cannot find conf dir: " + tmp);
1530 } else {
1531 logger.log(Level.WARNING, "Specified " + key +
1532 " dir does not exist. Falling back on default");
1533 }
1534 dir = getSubDir("conf", fail);
1535 }
1536 return dir;
1537 }
1538
1539 /***
1540 * @return Returns the httpServer. May be null if one was not started.
1541 */
1542 public static SimpleHttpServer getHttpServer() {
1543 return Heritrix.httpServer;
1544 }
1545
1546 /***
1547 * @throws IOException
1548 * @return Returns the directory under which reside the WAR files
1549 * we're to load into the servlet container.
1550 */
1551 public static File getWarsdir()
1552 throws IOException {
1553 return getSubDir("webapps");
1554 }
1555
1556 /***
1557 * Prepars for program shutdown. This method does it's best to prepare the
1558 * program so that it can exit normally. It will kill the httpServer and
1559 * terminate any running job.<br>
1560 * It is advisible to wait a few (~1000) millisec after calling this method
1561 * and before calling performHeritrixShutDown() to allow as many threads as
1562 * possible to finish what they are doing.
1563 */
1564 public static void prepareHeritrixShutDown() {
1565
1566
1567
1568 final Object [] keys = Heritrix.instances.keySet().toArray();
1569 for (int i = 0; i < keys.length; i++) {
1570 ((Heritrix)Heritrix.instances.get(keys[i])).destroy();
1571 }
1572
1573 try {
1574 deregisterJndi(getJndiContainerName());
1575 } catch (NameNotFoundException e) {
1576
1577 logger.log(Level.WARNING, "deregistration of jndi", e);
1578 } catch (Exception e) {
1579 e.printStackTrace();
1580 }
1581
1582 if(Heritrix.httpServer != null) {
1583
1584 try {
1585 Heritrix.httpServer.stopServer();
1586 } catch (InterruptedException e) {
1587
1588
1589 e.printStackTrace();
1590 } finally {
1591 Heritrix.httpServer = null;
1592 }
1593 }
1594 }
1595
1596 /***
1597 * Exit program. Recommended that prepareHeritrixShutDown() be invoked
1598 * prior to this method.
1599 */
1600 public static void performHeritrixShutDown() {
1601 performHeritrixShutDown(0);
1602 }
1603
1604 /***
1605 * Exit program. Recommended that prepareHeritrixShutDown() be invoked
1606 * prior to this method.
1607 *
1608 * @param exitCode Code to pass System.exit.
1609 *
1610 */
1611 public static void performHeritrixShutDown(int exitCode) {
1612 System.exit(exitCode);
1613 }
1614
1615 /***
1616 * Shutdown all running heritrix instances and the JVM.
1617 * Assumes stop has already been called.
1618 * @param exitCode Exit code to pass system exit.
1619 */
1620 public static void shutdown(final int exitCode) {
1621 getShutdownThread(true, exitCode, "Heritrix shutdown").start();
1622 }
1623
1624 protected static Thread getShutdownThread(final boolean sysexit,
1625 final int exitCode, final String name) {
1626 Thread t = new Thread(name) {
1627 public void run() {
1628 Heritrix.prepareHeritrixShutDown();
1629 if (sysexit) {
1630 Heritrix.performHeritrixShutDown(exitCode);
1631 }
1632 }
1633 };
1634 t.setDaemon(true);
1635 return t;
1636 }
1637
1638 public static void shutdown() {
1639 shutdown(0);
1640 }
1641
1642 /***
1643 * Register Heritrix with JNDI, JMX, and with the static hashtable of all
1644 * Heritrix instances known to this JVM.
1645 *
1646 * If launched from cmdline, register Heritrix MBean if an agent to register
1647 * ourselves with. Usually this method will only have effect if we're
1648 * running in a 1.5.0 JDK and command line options such as
1649 * '-Dcom.sun.management.jmxremote.port=8082
1650 * -Dcom.sun.management.jmxremote.authenticate=false
1651 * -Dcom.sun.management.jmxremote.ssl=false' are supplied.
1652 * See <a href="http://java.sun.com/j2se/1.5.0/docs/guide/management/agent.html">Monitoring
1653 * and Management Using JMX</a>
1654 * for more on the command line options and how to connect to the
1655 * Heritrix bean using the JDK 1.5.0 jconsole tool. We register currently
1656 * with first server we find (TODO: Make configurable).
1657 *
1658 * <p>If we register successfully with a JMX agent, then part of the
1659 * registration will include our registering ourselves with JNDI.
1660 *
1661 * <p>Finally, add the heritrix instance to the hashtable of all the
1662 * Heritrix instances floating in the current VM. This latter registeration
1663 * happens whether or no there is a JMX agent to register with. This is
1664 * a list we keep out of convenience so its easy iterating over all
1665 * all instances calling stop when main application is going down.
1666 *
1667 * @param h Instance of heritrix to register.
1668 * @param name Name to use for this Heritrix instance.
1669 * @param jmxregister True if we are to register this instance with JMX.
1670 * @throws NullPointerException
1671 * @throws MalformedObjectNameException
1672 * @throws NotCompliantMBeanException
1673 * @throws MBeanRegistrationException
1674 * @throws InstanceAlreadyExistsException
1675 */
1676 protected static void registerHeritrix(final Heritrix h,
1677 final String name, final boolean jmxregister)
1678 throws MalformedObjectNameException, InstanceAlreadyExistsException,
1679 MBeanRegistrationException, NotCompliantMBeanException {
1680 MBeanServer server = getMBeanServer();
1681 if (server != null) {
1682
1683
1684
1685 if (jmxregister) {
1686 ObjectName objName = (name == null || name.length() <= 0)?
1687 getJmxObjectName(): getJmxObjectName(name);
1688 registerMBean(server, h, objName);
1689 }
1690 } else {
1691
1692
1693
1694
1695 Heritrix.instances.put(h.getNoJmxName(), h);
1696 }
1697 }
1698
1699 protected static void unregisterHeritrix(final Heritrix h)
1700 throws InstanceNotFoundException, MBeanRegistrationException,
1701 NullPointerException {
1702 MBeanServer server = getMBeanServer();
1703 if (server != null) {
1704 server.unregisterMBean(h.mbeanName);
1705 } else {
1706
1707
1708 Heritrix.instances.remove(h.getNoJmxName());
1709 }
1710 }
1711
1712 /***
1713 * Get MBeanServer.
1714 * Currently uses first MBeanServer found. This will definetly not be whats
1715 * always wanted. TODO: Make which server settable. Also, if none, put up
1716 * our own MBeanServer.
1717 * @return An MBeanServer to register with or null.
1718 */
1719 public static MBeanServer getMBeanServer() {
1720 MBeanServer result = null;
1721 List servers = MBeanServerFactory.findMBeanServer(null);
1722 if (servers == null) {
1723 return result;
1724 }
1725 for (Iterator i = servers.iterator(); i.hasNext();) {
1726 MBeanServer server = (MBeanServer)i.next();
1727 if (server == null) {
1728 continue;
1729 }
1730 result = server;
1731 break;
1732 }
1733 return result;
1734 }
1735
1736 public static MBeanServer registerMBean(final Object objToRegister,
1737 final String name, final String type)
1738 throws InstanceAlreadyExistsException, MBeanRegistrationException,
1739 NotCompliantMBeanException {
1740 MBeanServer server = getMBeanServer();
1741 if (server != null) {
1742 server = registerMBean(server, objToRegister, name, type);
1743 }
1744 return server;
1745 }
1746
1747 public static MBeanServer registerMBean(final MBeanServer server,
1748 final Object objToRegister, final String name, final String type)
1749 throws InstanceAlreadyExistsException, MBeanRegistrationException,
1750 NotCompliantMBeanException {
1751 try {
1752 Hashtable<String,String> ht = new Hashtable<String,String>();
1753 ht.put(JmxUtils.NAME, name);
1754 ht.put(JmxUtils.TYPE, type);
1755 registerMBean(server, objToRegister,
1756 new ObjectName(CRAWLER_PACKAGE, ht));
1757 } catch (MalformedObjectNameException e) {
1758 e.printStackTrace();
1759 }
1760 return server;
1761 }
1762
1763 public static MBeanServer registerMBean(final MBeanServer server,
1764 final Object objToRegister, final ObjectName objName)
1765 throws InstanceAlreadyExistsException, MBeanRegistrationException,
1766 NotCompliantMBeanException {
1767 server.registerMBean(objToRegister, objName);
1768 return server;
1769 }
1770
1771 public static void unregisterMBean(final MBeanServer server,
1772 final String name, final String type) {
1773 if (server == null) {
1774 return;
1775 }
1776 try {
1777 unregisterMBean(server, getJmxObjectName(name, type));
1778 } catch (MalformedObjectNameException e) {
1779 e.printStackTrace();
1780 }
1781 }
1782
1783 public static void unregisterMBean(final MBeanServer server,
1784 final ObjectName name) {
1785 try {
1786 server.unregisterMBean(name);
1787 logger.info("Unregistered bean " + name.getCanonicalName());
1788 } catch (InstanceNotFoundException e) {
1789 e.printStackTrace();
1790 } catch (MBeanRegistrationException e) {
1791 e.printStackTrace();
1792 } catch (NullPointerException e) {
1793 e.printStackTrace();
1794 }
1795 }
1796
1797 /***
1798 * @return Name to use when no JMX agent available.
1799 */
1800 protected String getNoJmxName() {
1801 return this.getClass().getName();
1802 }
1803
1804 public static ObjectName getJmxObjectName()
1805 throws MalformedObjectNameException, NullPointerException {
1806 return getJmxObjectName("Heritrix", JmxUtils.SERVICE);
1807 }
1808
1809 public static ObjectName getJmxObjectName(final String name)
1810 throws MalformedObjectNameException, NullPointerException {
1811 return getJmxObjectName(name, JmxUtils.SERVICE);
1812 }
1813
1814 public static ObjectName getJmxObjectName(final String name,
1815 final String type)
1816 throws MalformedObjectNameException, NullPointerException {
1817 Hashtable<String,String> ht = new Hashtable<String,String>();
1818 ht.put(JmxUtils.NAME, name);
1819 ht.put(JmxUtils.TYPE, type);
1820 return new ObjectName(CRAWLER_PACKAGE, ht);
1821 }
1822
1823 /***
1824 * @return Returns true if Heritrix was launched from the command line.
1825 * (When launched from command line, we do stuff like put up a web server
1826 * to manage our web interface and we register ourselves with the first
1827 * available jmx agent).
1828 */
1829 public static boolean isCommandLine() {
1830 return Heritrix.commandLine;
1831 }
1832
1833 /***
1834 * @return True if heritrix has been started.
1835 */
1836 public boolean isStarted() {
1837 return this.jobHandler != null;
1838 }
1839
1840 public String getStatus() {
1841 StringBuffer buffer = new StringBuffer();
1842 if (this.getJobHandler() != null) {
1843 buffer.append("isRunning=");
1844 buffer.append(this.getJobHandler().isRunning());
1845 buffer.append(" isCrawling=");
1846 buffer.append(this.getJobHandler().isCrawling());
1847 buffer.append(" alertCount=");
1848 buffer.append(getAlertsCount());
1849 buffer.append(" newAlertCount=");
1850 buffer.append(getNewAlertsCount());
1851 if (this.getJobHandler().isCrawling()) {
1852 buffer.append(" currentJob=");
1853 buffer.append(this.getJobHandler().getCurrentJob().
1854 getJmxJobName());
1855 }
1856 }
1857 return buffer.toString();
1858 }
1859
1860
1861 public int getAlertsCount() {
1862 return this.alertManager.getCount();
1863 }
1864
1865 public int getNewAlertsCount() {
1866 return this.alertManager.getNewCount();
1867 }
1868
1869 public Vector getAlerts() {
1870 return this.alertManager.getAll();
1871 }
1872
1873 public Vector getNewAlerts() {
1874 return this.alertManager.getNewAll();
1875 }
1876
1877 public SinkHandlerLogRecord getAlert(final String id) {
1878 return this.alertManager.get(id);
1879 }
1880
1881 public void readAlert(final String id) {
1882 this.alertManager.read(id);
1883 }
1884
1885 public void removeAlert(final String id) {
1886 this.alertManager.remove(id);
1887 }
1888
1889 /***
1890 * Start Heritrix.
1891 *
1892 * Used by JMX and webapp initialization for starting Heritrix.
1893 * Not by the cmdline launched Heritrix. Idempotent.
1894 * If start is called by JMX, then new instance of Heritrix is automatically
1895 * registered w/ JMX Agent. If started by webapp, need to register the new
1896 * Heritrix instance.
1897 */
1898 public void start() {
1899
1900
1901 if (!Heritrix.isCommandLine() && !isStarted()) {
1902 try {
1903 logger.info(launch());
1904 } catch (Exception e) {
1905 e.printStackTrace();
1906 }
1907 }
1908 }
1909
1910 /***
1911 * Stop Heritrix.
1912 *
1913 * Used by JMX and webapp initialization for stopping Heritrix.
1914 */
1915 public void stop() {
1916 if (this.jobHandler != null) {
1917 this.jobHandler.stop();
1918 }
1919 }
1920
1921 public String interrupt(String threadName) {
1922 String result = "Thread " + threadName + " not found";
1923 ThreadGroup group = Thread.currentThread().getThreadGroup();
1924 if (group == null) {
1925 return result;
1926 }
1927
1928
1929 ThreadGroup parent = null;
1930 while((parent = group.getParent()) != null) {
1931 group = parent;
1932 }
1933
1934
1935 final int max = group.activeCount() * 2;
1936 Thread [] threads = new Thread[max];
1937 int threadCount = group.enumerate(threads, true);
1938 if (threadCount >= max) {
1939 logger.info("Some threads not found...array too small: " +
1940 max);
1941 }
1942 for (int j = 0; j < threadCount; j++) {
1943 if (threads[j].getName().equals(threadName)) {
1944 threads[j].interrupt();
1945 result = "Interrupt sent to " + threadName;
1946 break;
1947 }
1948 }
1949 return result;
1950 }
1951
1952
1953
1954 /***
1955 * Build up the MBean info for Heritrix main.
1956 * @return Return created mbean info instance.
1957 */
1958 protected OpenMBeanInfoSupport buildMBeanInfo() {
1959 OpenMBeanAttributeInfoSupport[] attributes =
1960 new OpenMBeanAttributeInfoSupport[Heritrix.ATTRIBUTE_LIST.size()];
1961 OpenMBeanConstructorInfoSupport[] constructors =
1962 new OpenMBeanConstructorInfoSupport[1];
1963 OpenMBeanOperationInfoSupport[] operations =
1964 new OpenMBeanOperationInfoSupport[Heritrix.OPERATION_LIST.size()];
1965 MBeanNotificationInfo[] notifications =
1966 new MBeanNotificationInfo[0];
1967
1968
1969 attributes[0] =
1970 new OpenMBeanAttributeInfoSupport(Heritrix.STATUS_ATTR,
1971 "Short basic status message", SimpleType.STRING, true,
1972 false, false);
1973
1974 attributes[1] =
1975 new OpenMBeanAttributeInfoSupport(Heritrix.VERSION_ATTR,
1976 "Heritrix version", SimpleType.STRING, true, false, false);
1977
1978 attributes[2] =
1979 new OpenMBeanAttributeInfoSupport(Heritrix.ISRUNNING_ATTR,
1980 "Whether the crawler is running", SimpleType.BOOLEAN, true,
1981 false, false);
1982
1983 attributes[3] =
1984 new OpenMBeanAttributeInfoSupport(Heritrix.ISCRAWLING_ATTR,
1985 "Whether the crawler is crawling", SimpleType.BOOLEAN, true,
1986 false, false);
1987
1988 attributes[4] =
1989 new OpenMBeanAttributeInfoSupport(Heritrix.ALERTCOUNT_ATTR,
1990 "The number of alerts", SimpleType.INTEGER, true, false, false);
1991
1992 attributes[5] =
1993 new OpenMBeanAttributeInfoSupport(Heritrix.NEWALERTCOUNT_ATTR,
1994 "The number of new alerts", SimpleType.INTEGER, true, false,
1995 false);
1996
1997 attributes[6] =
1998 new OpenMBeanAttributeInfoSupport(Heritrix.CURRENTJOB_ATTR,
1999 "The name of the job currently being crawled",
2000 SimpleType.STRING, true, false, false);
2001
2002
2003 constructors[0] = new OpenMBeanConstructorInfoSupport(
2004 "HeritrixOpenMBean", "Constructs Heritrix OpenMBean instance ",
2005 new OpenMBeanParameterInfoSupport[0]);
2006
2007
2008 operations[0] = new OpenMBeanOperationInfoSupport(
2009 Heritrix.START_OPER, "Start Heritrix instance", null,
2010 SimpleType.VOID, MBeanOperationInfo.ACTION);
2011
2012 operations[1] = new OpenMBeanOperationInfoSupport(
2013 Heritrix.STOP_OPER, "Stop Heritrix instance", null,
2014 SimpleType.VOID, MBeanOperationInfo.ACTION);
2015
2016 OpenMBeanParameterInfo[] args = new OpenMBeanParameterInfoSupport[1];
2017 args[0] = new OpenMBeanParameterInfoSupport("threadName",
2018 "Name of thread to send interrupt", SimpleType.STRING);
2019 operations[2] = new OpenMBeanOperationInfoSupport(
2020 Heritrix.INTERRUPT_OPER, "Send thread an interrupt " +
2021 "(Used debugging)", args, SimpleType.STRING,
2022 MBeanOperationInfo.ACTION_INFO);
2023
2024 operations[3] = new OpenMBeanOperationInfoSupport(
2025 Heritrix.START_CRAWLING_OPER, "Set Heritrix instance " +
2026 "into crawling mode", null, SimpleType.VOID,
2027 MBeanOperationInfo.ACTION);
2028
2029 operations[4] = new OpenMBeanOperationInfoSupport(
2030 Heritrix.STOP_CRAWLING_OPER, "Unset Heritrix instance " +
2031 " crawling mode", null, SimpleType.VOID,
2032 MBeanOperationInfo.ACTION);
2033
2034 args = new OpenMBeanParameterInfoSupport[4];
2035 args[0] = new OpenMBeanParameterInfoSupport("pathOrURL",
2036 "Path/URL to order or jar of order+seed",
2037 SimpleType.STRING);
2038 args[1] = new OpenMBeanParameterInfoSupport("name",
2039 "Basename for new job", SimpleType.STRING);
2040 args[2] = new OpenMBeanParameterInfoSupport("description",
2041 "Description to save with new job", SimpleType.STRING);
2042 args[3] = new OpenMBeanParameterInfoSupport("seeds",
2043 "Initial seed(s)", SimpleType.STRING);
2044 operations[5] = new OpenMBeanOperationInfoSupport(
2045 Heritrix.ADD_CRAWL_JOB_OPER, "Add new crawl job", args,
2046 SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2047
2048 args = new OpenMBeanParameterInfoSupport[4];
2049 args[0] = new OpenMBeanParameterInfoSupport("uidOrName",
2050 "Job UID or profile name", SimpleType.STRING);
2051 args[1] = new OpenMBeanParameterInfoSupport("name",
2052 "Basename for new job", SimpleType.STRING);
2053 args[2] = new OpenMBeanParameterInfoSupport("description",
2054 "Description to save with new job", SimpleType.STRING);
2055 args[3] = new OpenMBeanParameterInfoSupport("seeds",
2056 "Initial seed(s)", SimpleType.STRING);
2057 operations[6] = new OpenMBeanOperationInfoSupport(
2058 Heritrix.ADD_CRAWL_JOB_BASEDON_OPER,
2059 "Add a new crawl job based on passed Job UID or profile",
2060 args, SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2061
2062 args = new OpenMBeanParameterInfoSupport[1];
2063 args[0] = new OpenMBeanParameterInfoSupport("UID",
2064 "Job UID", SimpleType.STRING);
2065 operations[7] = new OpenMBeanOperationInfoSupport(DELETE_CRAWL_JOB_OPER,
2066 "Delete/stop this crawl job", args, SimpleType.VOID,
2067 MBeanOperationInfo.ACTION);
2068
2069 args = new OpenMBeanParameterInfoSupport[1];
2070 args[0] = new OpenMBeanParameterInfoSupport("index",
2071 "Zero-based index into array of alerts", SimpleType.INTEGER);
2072 operations[8] = new OpenMBeanOperationInfoSupport(
2073 Heritrix.ALERT_OPER, "Return alert at passed index", args,
2074 SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2075
2076 try {
2077 this.jobCompositeType = new CompositeType("job",
2078 "Job attributes", JOB_KEYS,
2079 new String [] {"Job unique ID", "Job name", "Job status"},
2080 new OpenType [] {SimpleType.STRING, SimpleType.STRING,
2081 SimpleType.STRING});
2082 this.jobsTabularType = new TabularType("jobs", "List of jobs",
2083 this.jobCompositeType, new String [] {"uid"});
2084 } catch (OpenDataException e) {
2085
2086 throw new RuntimeException(e);
2087 }
2088 operations[9] = new OpenMBeanOperationInfoSupport(
2089 Heritrix.PENDING_JOBS_OPER,
2090 "List of pending jobs (or null if none)", null,
2091 this.jobsTabularType, MBeanOperationInfo.INFO);
2092 operations[10] = new OpenMBeanOperationInfoSupport(
2093 Heritrix.COMPLETED_JOBS_OPER,
2094 "List of completed jobs (or null if none)", null,
2095 this.jobsTabularType, MBeanOperationInfo.INFO);
2096
2097 args = new OpenMBeanParameterInfoSupport[2];
2098 args[0] = new OpenMBeanParameterInfoSupport("uid",
2099 "Job unique ID", SimpleType.STRING);
2100 args[1] = new OpenMBeanParameterInfoSupport("name",
2101 "Report name (e.g. crawl-report, etc.)",
2102 SimpleType.STRING);
2103 operations[11] = new OpenMBeanOperationInfoSupport(
2104 Heritrix.CRAWLEND_REPORT_OPER, "Return crawl-end report", args,
2105 SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2106
2107 operations[12] = new OpenMBeanOperationInfoSupport(
2108 Heritrix.SHUTDOWN_OPER, "Shutdown container", null,
2109 SimpleType.VOID, MBeanOperationInfo.ACTION);
2110
2111 args = new OpenMBeanParameterInfoSupport[2];
2112 args[0] = new OpenMBeanParameterInfoSupport("level",
2113 "Log level: e.g. SEVERE, WARNING, etc.", SimpleType.STRING);
2114 args[1] = new OpenMBeanParameterInfoSupport("message",
2115 "Log message", SimpleType.STRING);
2116 operations[13] = new OpenMBeanOperationInfoSupport(Heritrix.LOG_OPER,
2117 "Add a log message", args, SimpleType.VOID,
2118 MBeanOperationInfo.ACTION);
2119
2120 operations[14] = new OpenMBeanOperationInfoSupport(
2121 Heritrix.DESTROY_OPER, "Destroy Heritrix instance", null,
2122 SimpleType.VOID, MBeanOperationInfo.ACTION);
2123
2124 operations[15] = new OpenMBeanOperationInfoSupport(
2125 Heritrix.TERMINATE_CRAWL_JOB_OPER,
2126 "Returns false if no current job", null, SimpleType.BOOLEAN,
2127 MBeanOperationInfo.ACTION);
2128
2129 operations[16] = new OpenMBeanOperationInfoSupport(
2130 Heritrix.REBIND_JNDI_OPER,
2131 "Rebinds this Heritrix with JNDI.", null,
2132 SimpleType.VOID, MBeanOperationInfo.ACTION);
2133
2134
2135 return new OpenMBeanInfoSupport(this.getClass().getName(),
2136 "Heritrix Main OpenMBean", attributes, constructors, operations,
2137 notifications);
2138 }
2139
2140 public Object getAttribute(String attribute_name)
2141 throws AttributeNotFoundException {
2142 if (attribute_name == null) {
2143 throw new RuntimeOperationsException(
2144 new IllegalArgumentException("Attribute name cannot be null"),
2145 "Cannot call getAttribute with null attribute name");
2146 }
2147 if (!Heritrix.ATTRIBUTE_LIST.contains(attribute_name)) {
2148 throw new AttributeNotFoundException("Attribute " +
2149 attribute_name + " is unimplemented.");
2150 }
2151
2152
2153
2154
2155 if (attribute_name.equals(STATUS_ATTR)) {
2156 return getStatus();
2157 }
2158 if (attribute_name.equals(VERSION_ATTR)) {
2159 return getVersion();
2160 }
2161
2162 if (attribute_name.equals(ISRUNNING_ATTR)) {
2163 return new Boolean(this.getJobHandler().isRunning());
2164 }
2165 if (attribute_name.equals(ISCRAWLING_ATTR)) {
2166 return new Boolean(this.getJobHandler().isCrawling());
2167 }
2168 if (attribute_name.equals(ALERTCOUNT_ATTR)) {
2169 return new Integer(getAlertsCount());
2170 }
2171 if (attribute_name.equals(NEWALERTCOUNT_ATTR)) {
2172 return new Integer(getNewAlertsCount());
2173 }
2174 if (attribute_name.equals(CURRENTJOB_ATTR)) {
2175 if (this.getJobHandler().isCrawling()) {
2176 return this.getJobHandler().getCurrentJob().getJmxJobName();
2177 }
2178 return null;
2179 }
2180 throw new AttributeNotFoundException("Attribute " +
2181 attribute_name + " not found.");
2182 }
2183
2184 public void setAttribute(Attribute attribute)
2185 throws AttributeNotFoundException {
2186 throw new AttributeNotFoundException("No attribute can be set in " +
2187 "this MBean");
2188 }
2189
2190 public AttributeList getAttributes(String [] attributeNames) {
2191 if (attributeNames == null) {
2192 throw new RuntimeOperationsException(
2193 new IllegalArgumentException("attributeNames[] cannot be " +
2194 "null"), "Cannot call getAttributes with null attribute " +
2195 "names");
2196 }
2197 AttributeList resultList = new AttributeList();
2198 if (attributeNames.length == 0) {
2199 return resultList;
2200 }
2201 for (int i = 0; i < attributeNames.length; i++) {
2202 try {
2203 Object value = getAttribute(attributeNames[i]);
2204 resultList.add(new Attribute(attributeNames[i], value));
2205 } catch (Exception e) {
2206 e.printStackTrace();
2207 }
2208 }
2209 return(resultList);
2210 }
2211
2212 public AttributeList setAttributes(AttributeList attributes) {
2213 return new AttributeList();
2214 }
2215
2216 public Object invoke(final String operationName, final Object[] params,
2217 final String[] signature)
2218 throws ReflectionException {
2219 if (operationName == null) {
2220 throw new RuntimeOperationsException(
2221 new IllegalArgumentException("Operation name cannot be null"),
2222 "Cannot call invoke with null operation name");
2223 }
2224
2225 if (logger.isLoggable(Level.INFO)) {
2226 String paramsString = "";
2227 for (Object o : params) {
2228 paramsString.concat("[" + o.toString() + "]");
2229 }
2230 logger.info("JMX invoke: " + operationName + " [" + paramsString
2231 + "]");
2232 }
2233
2234
2235
2236
2237 if (operationName.equals(START_OPER)) {
2238 JmxUtils.checkParamsCount(START_OPER, params, 0);
2239 start();
2240 return null;
2241 }
2242 if (operationName.equals(STOP_OPER)) {
2243 JmxUtils.checkParamsCount(STOP_OPER, params, 0);
2244 stop();
2245 return null;
2246 }
2247 if (operationName.equals(DESTROY_OPER)) {
2248 JmxUtils.checkParamsCount(DESTROY_OPER, params, 0);
2249 destroy();
2250 return null;
2251 }
2252 if (operationName.equals(TERMINATE_CRAWL_JOB_OPER)) {
2253 JmxUtils.checkParamsCount(TERMINATE_CRAWL_JOB_OPER, params, 0);
2254 return new Boolean(this.jobHandler.terminateCurrentJob());
2255 }
2256 if (operationName.equals(REBIND_JNDI_OPER)) {
2257 JmxUtils.checkParamsCount(REBIND_JNDI_OPER, params, 0);
2258 try {
2259 registerContainerJndi();
2260 } catch (MalformedObjectNameException e) {
2261 throw new RuntimeOperationsException(new RuntimeException(e));
2262 } catch (UnknownHostException e) {
2263 throw new RuntimeOperationsException(new RuntimeException(e));
2264 } catch (NamingException e) {
2265 throw new RuntimeOperationsException(new RuntimeException(e));
2266 }
2267 return null;
2268 }
2269 if (operationName.equals(SHUTDOWN_OPER)) {
2270 JmxUtils.checkParamsCount(SHUTDOWN_OPER, params, 0);
2271 Heritrix.shutdown();
2272 return null;
2273 }
2274 if (operationName.equals(LOG_OPER)) {
2275 JmxUtils.checkParamsCount(LOG_OPER, params, 2);
2276 logger.log(Level.parse((String)params[0]), (String)params[1]);
2277 return null;
2278 }
2279 if (operationName.equals(INTERRUPT_OPER)) {
2280 JmxUtils.checkParamsCount(INTERRUPT_OPER, params, 1);
2281 return interrupt((String)params[0]);
2282 }
2283 if (operationName.equals(START_CRAWLING_OPER)) {
2284 JmxUtils.checkParamsCount(START_CRAWLING_OPER, params, 0);
2285 startCrawling();
2286 return null;
2287 }
2288 if (operationName.equals(STOP_CRAWLING_OPER)) {
2289 JmxUtils.checkParamsCount(STOP_CRAWLING_OPER, params, 0);
2290 stopCrawling();
2291 return null;
2292 }
2293 if (operationName.equals(ADD_CRAWL_JOB_OPER)) {
2294 JmxUtils.checkParamsCount(ADD_CRAWL_JOB_OPER, params, 4);
2295 try {
2296 return addCrawlJob((String)params[0], (String)params[1],
2297 checkForEmptyPlaceHolder((String)params[2]),
2298 checkForEmptyPlaceHolder((String)params[3]));
2299 } catch (IOException e) {
2300 throw new RuntimeOperationsException(new RuntimeException(e));
2301 } catch (FatalConfigurationException e) {
2302 throw new RuntimeOperationsException(new RuntimeException(e));
2303 }
2304 }
2305 if (operationName.equals(DELETE_CRAWL_JOB_OPER)) {
2306 JmxUtils.checkParamsCount(DELETE_CRAWL_JOB_OPER, params, 1);
2307 this.jobHandler.deleteJob((String)params[0]);
2308 return null;
2309 }
2310
2311 if (operationName.equals(ADD_CRAWL_JOB_BASEDON_OPER)) {
2312 JmxUtils.checkParamsCount(ADD_CRAWL_JOB_BASEDON_OPER, params, 4);
2313 return addCrawlJobBasedOn((String)params[0], (String)params[1],
2314 checkForEmptyPlaceHolder((String)params[2]),
2315 checkForEmptyPlaceHolder((String)params[3]));
2316 }
2317 if (operationName.equals(ALERT_OPER)) {
2318 JmxUtils.checkParamsCount(ALERT_OPER, params, 1);
2319 SinkHandlerLogRecord slr = null;
2320 if (this.alertManager.getCount() > 0) {
2321
2322
2323
2324 slr = (SinkHandlerLogRecord)this.alertManager.getAll().
2325 get(((Integer)params[0]).intValue());
2326 }
2327 return (slr != null)? slr.toString(): null;
2328 }
2329
2330 if (operationName.equals(PENDING_JOBS_OPER)) {
2331 JmxUtils.checkParamsCount(PENDING_JOBS_OPER, params, 0);
2332 try {
2333 return makeJobsTabularData(getJobHandler().getPendingJobs());
2334 } catch (OpenDataException e) {
2335 throw new RuntimeOperationsException(new RuntimeException(e));
2336 }
2337 }
2338
2339 if (operationName.equals(COMPLETED_JOBS_OPER)) {
2340 JmxUtils.checkParamsCount(COMPLETED_JOBS_OPER, params, 0);
2341 try {
2342 return makeJobsTabularData(getJobHandler().getCompletedJobs());
2343 } catch (OpenDataException e) {
2344 throw new RuntimeOperationsException(new RuntimeException(e));
2345 }
2346 }
2347
2348 if (operationName.equals(CRAWLEND_REPORT_OPER)) {
2349 JmxUtils.checkParamsCount(CRAWLEND_REPORT_OPER, params, 2);
2350 try {
2351 return getCrawlendReport((String)params[0], (String) params[1]);
2352 } catch (IOException e) {
2353 throw new RuntimeOperationsException(new RuntimeException(e));
2354 }
2355 }
2356
2357 throw new ReflectionException(
2358 new NoSuchMethodException(operationName),
2359 "Cannot find the operation " + operationName);
2360 }
2361
2362 /***
2363 * Return named crawl end report for job with passed uid.
2364 * Crawler makes reports when its finished its crawl. Use this method
2365 * to get a String version of one of these files.
2366 * @param jobUid The unique ID for the job whose reports you want to see
2367 * (Must be a completed job).
2368 * @param reportName Name of report minus '.txt' (e.g. crawl-report).
2369 * @return String version of the on-disk report.
2370 * @throws IOException
2371 */
2372 protected String getCrawlendReport(String jobUid, String reportName)
2373 throws IOException {
2374 CrawlJob job = getJobHandler().getJob(jobUid);
2375 if (job == null) {
2376 throw new IOException("No such job: " + jobUid);
2377 }
2378 File report = new File(job.getDirectory(), reportName + ".txt");
2379 if (!report.exists()) {
2380 throw new FileNotFoundException(report.getAbsolutePath());
2381 }
2382 return FileUtils.readFileAsString(report);
2383 }
2384
2385 protected TabularData makeJobsTabularData(List jobs)
2386 throws OpenDataException {
2387 if (jobs == null || jobs.size() == 0) {
2388 return null;
2389 }
2390 TabularData td = new TabularDataSupport(this.jobsTabularType);
2391 for (Iterator i = jobs.iterator(); i.hasNext();) {
2392 CrawlJob job = (CrawlJob)i.next();
2393 CompositeData cd = new CompositeDataSupport(this.jobCompositeType,
2394 JOB_KEYS,
2395 new String [] {job.getUID(), job.getJobName(), job.getStatus()});
2396 td.put(cd);
2397 }
2398 return td;
2399 }
2400
2401 /***
2402 * If passed str has placeholder for the empty string, return the empty
2403 * string else return orginal.
2404 * Dumb jmx clients can't pass empty string so they'll pass a representation
2405 * of empty string such as ' ' or '-'. Convert such strings to empty
2406 * string.
2407 * @param str String to check.
2408 * @return Original <code>str</code> or empty string if <code>str</code>
2409 * contains a placeholder for the empty-string (e.g. '-', or ' ').
2410 */
2411 protected String checkForEmptyPlaceHolder(String str) {
2412 return TextUtils.matches("-| +", str)? "": str;
2413 }
2414
2415 public MBeanInfo getMBeanInfo() {
2416 return this.openMBeanInfo;
2417 }
2418
2419 /***
2420 * @return Name this instance registered in JMX (Only available after JMX
2421 * registration).
2422 */
2423 public ObjectName getMBeanName() {
2424 return this.mbeanName;
2425 }
2426
2427 public ObjectName preRegister(MBeanServer server, ObjectName name)
2428 throws Exception {
2429 this.mbeanServer = server;
2430 @SuppressWarnings("unchecked")
2431 Hashtable<String,String> ht = name.getKeyPropertyList();
2432 if (!ht.containsKey(JmxUtils.NAME)) {
2433 throw new IllegalArgumentException("Name property required" +
2434 name.getCanonicalName());
2435 }
2436 if (!ht.containsKey(JmxUtils.TYPE)) {
2437 ht.put(JmxUtils.TYPE, JmxUtils.SERVICE);
2438 name = new ObjectName(name.getDomain(), ht);
2439 }
2440 this.mbeanName = addGuiPort(addVitals(name));
2441 Heritrix.instances.put(this.mbeanName.
2442 getCanonicalKeyPropertyListString(), this);
2443 return this.mbeanName;
2444 }
2445
2446 /***
2447 * Add vital stats to passed in ObjectName.
2448 * @param name ObjectName to add to.
2449 * @return name with host, guiport, and jmxport added.
2450 * @throws UnknownHostException
2451 * @throws MalformedObjectNameException
2452 * @throws NullPointerException
2453 */
2454 protected static ObjectName addVitals(ObjectName name)
2455 throws UnknownHostException, MalformedObjectNameException,
2456 NullPointerException {
2457 @SuppressWarnings("unchecked")
2458 Hashtable<String,String> ht = name.getKeyPropertyList();
2459 if (!ht.containsKey(JmxUtils.HOST)) {
2460 ht.put(JmxUtils.HOST, InetAddress.getLocalHost().getHostName());
2461 name = new ObjectName(name.getDomain(), ht);
2462 }
2463 if (!ht.containsKey(JmxUtils.JMX_PORT)) {
2464
2465
2466
2467
2468 String p = System.getProperty("com.sun.management.jmxremote.port");
2469 if (p != null && p.length() > 0) {
2470 ht.put(JmxUtils.JMX_PORT, p);
2471 name = new ObjectName(name.getDomain(), ht);
2472 }
2473 }
2474 return name;
2475 }
2476
2477 protected static ObjectName addGuiPort(ObjectName name)
2478 throws MalformedObjectNameException, NullPointerException {
2479 @SuppressWarnings("unchecked")
2480 Hashtable<String,String> ht = name.getKeyPropertyList();
2481 if (!ht.containsKey(JmxUtils.GUI_PORT)) {
2482
2483 if (Heritrix.gui) {
2484 ht.put(JmxUtils.GUI_PORT, Integer.toString(Heritrix.guiPort));
2485 name = new ObjectName(name.getDomain(), ht);
2486 }
2487 }
2488 return name;
2489 }
2490
2491 public void postRegister(Boolean registrationDone) {
2492 if (logger.isLoggable(Level.INFO)) {
2493 logger.info(
2494 JmxUtils.getLogRegistrationMsg(this.mbeanName.getCanonicalName(),
2495 this.mbeanServer, registrationDone.booleanValue()));
2496 }
2497 try {
2498 registerJndi(this.mbeanName);
2499 } catch (Exception e) {
2500 logger.log(Level.SEVERE, "Failed jndi registration", e);
2501 }
2502 }
2503
2504 public void preDeregister() throws Exception {
2505 deregisterJndi(this.mbeanName);
2506 }
2507
2508 public void postDeregister() {
2509 Heritrix.instances.
2510 remove(this.mbeanName.getCanonicalKeyPropertyListString());
2511 if (logger.isLoggable(Level.INFO)) {
2512 logger.info(JmxUtils.getLogUnregistrationMsg(
2513 this.mbeanName.getCanonicalName(), this.mbeanServer));
2514 }
2515 }
2516
2517 protected static void registerContainerJndi()
2518 throws MalformedObjectNameException, NullPointerException,
2519 UnknownHostException, NamingException {
2520 registerJndi(getJndiContainerName());
2521 }
2522
2523 protected static void registerJndi(final ObjectName name)
2524 throws NullPointerException, NamingException {
2525 Context c = getJndiContext();
2526 if (c == null) {
2527 return;
2528 }
2529 CompoundName key = JndiUtils.bindObjectName(c, name);
2530 if (logger.isLoggable(Level.FINE)) {
2531 logger.fine("Bound '" + key + "' to '" + JndiUtils.
2532 getCompoundName(c.getNameInNamespace()).toString()
2533 + "' jndi context");
2534 }
2535 }
2536
2537 protected static void deregisterJndi(final ObjectName name)
2538 throws NullPointerException, NamingException {
2539 Context c = getJndiContext();
2540 if (c == null) {
2541 return;
2542 }
2543 CompoundName key = JndiUtils.unbindObjectName(c, name);
2544 if (logger.isLoggable(Level.FINE)) {
2545 logger.fine("Unbound '" + key + "' from '" +
2546 JndiUtils.getCompoundName(c.getNameInNamespace()).toString() +
2547 "' jndi context");
2548 }
2549 }
2550
2551 /***
2552 * @return Jndi context for the crawler or null if none found.
2553 * @throws NamingException
2554 */
2555 protected static Context getJndiContext() throws NamingException {
2556 Context c = null;
2557 try {
2558 c = JndiUtils.getSubContext(CRAWLER_PACKAGE);
2559 } catch (NoInitialContextException e) {
2560 logger.fine("No JNDI Context: " + e.toString());
2561 }
2562 return c;
2563 }
2564
2565 /***
2566 * @return Jndi container name -- the name to use for the 'container' that
2567 * can host zero or more heritrix instances (Return a JMX ObjectName. We
2568 * use ObjectName because then we're sync'd with JMX naming and ObjectName
2569 * has nice parsing).
2570 * @throws NullPointerException
2571 * @throws MalformedObjectNameException
2572 * @throws UnknownHostException
2573 */
2574 protected static ObjectName getJndiContainerName()
2575 throws MalformedObjectNameException, NullPointerException,
2576 UnknownHostException {
2577 ObjectName objName = new ObjectName(CRAWLER_PACKAGE, "type",
2578 "container");
2579 return addVitals(objName);
2580 }
2581
2582 /***
2583 * @return Return all registered instances of Heritrix (Rare are there
2584 * more than one).
2585 */
2586 public static Map getInstances() {
2587 return Heritrix.instances;
2588 }
2589
2590 /***
2591 * @return True if only one instance of Heritrix.
2592 */
2593 public static boolean isSingleInstance() {
2594 return Heritrix.instances != null && Heritrix.instances.size() == 1;
2595 }
2596
2597 /***
2598 * @return Returns single instance or null if no instance or multiple.
2599 */
2600 public static Heritrix getSingleInstance() {
2601 return !isSingleInstance()?
2602 null:
2603 (Heritrix)Heritrix.instances.
2604 get(Heritrix.instances.keySet().iterator().next());
2605 }
2606 }