View Javadoc

1   /* CrawlJob
2    *
3    * Copyright (C) 2003 Internet Archive.
4    *
5    * This file is part of the Heritrix web crawler (crawler.archive.org).
6    *
7    * Heritrix is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU Lesser Public License as published by
9    * the Free Software Foundation; either version 2.1 of the License, or
10   * any later version.
11   *
12   * Heritrix is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU Lesser Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser Public License
18   * along with Heritrix; if not, write to the Free Software
19   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20   */
21  package org.archive.crawler.admin;
22  
23  import java.io.BufferedReader;
24  import java.io.File;
25  import java.io.FileNotFoundException;
26  import java.io.FileReader;
27  import java.io.FileWriter;
28  import java.io.IOException;
29  import java.io.InputStream;
30  import java.io.InputStreamReader;
31  import java.io.PrintWriter;
32  import java.io.Serializable;
33  import java.io.StringWriter;
34  import java.util.ArrayList;
35  import java.util.Arrays;
36  import java.util.Collection;
37  import java.util.EventObject;
38  import java.util.Hashtable;
39  import java.util.Iterator;
40  import java.util.List;
41  import java.util.Map;
42  import java.util.logging.Level;
43  import java.util.logging.Logger;
44  
45  import javax.management.Attribute;
46  import javax.management.AttributeList;
47  import javax.management.AttributeNotFoundException;
48  import javax.management.DynamicMBean;
49  import javax.management.InstanceAlreadyExistsException;
50  import javax.management.InvalidAttributeValueException;
51  import javax.management.MBeanAttributeInfo;
52  import javax.management.MBeanException;
53  import javax.management.MBeanInfo;
54  import javax.management.MBeanNotificationInfo;
55  import javax.management.MBeanOperationInfo;
56  import javax.management.MBeanParameterInfo;
57  import javax.management.MBeanRegistration;
58  import javax.management.MBeanRegistrationException;
59  import javax.management.MBeanServer;
60  import javax.management.NotCompliantMBeanException;
61  import javax.management.Notification;
62  import javax.management.NotificationBroadcasterSupport;
63  import javax.management.ObjectName;
64  import javax.management.ReflectionException;
65  import javax.management.RuntimeOperationsException;
66  import javax.management.openmbean.CompositeData;
67  import javax.management.openmbean.CompositeDataSupport;
68  import javax.management.openmbean.CompositeType;
69  import javax.management.openmbean.OpenDataException;
70  import javax.management.openmbean.OpenMBeanAttributeInfo;
71  import javax.management.openmbean.OpenMBeanAttributeInfoSupport;
72  import javax.management.openmbean.OpenMBeanConstructorInfoSupport;
73  import javax.management.openmbean.OpenMBeanInfoSupport;
74  import javax.management.openmbean.OpenMBeanOperationInfo;
75  import javax.management.openmbean.OpenMBeanOperationInfoSupport;
76  import javax.management.openmbean.OpenMBeanParameterInfo;
77  import javax.management.openmbean.OpenMBeanParameterInfoSupport;
78  import javax.management.openmbean.SimpleType;
79  
80  import org.apache.commons.httpclient.URIException;
81  import org.apache.commons.io.IOUtils;
82  import org.archive.crawler.Heritrix;
83  import org.archive.crawler.datamodel.CandidateURI;
84  import org.archive.crawler.datamodel.Checkpoint;
85  import org.archive.crawler.datamodel.CrawlOrder;
86  import org.archive.crawler.event.CrawlStatusListener;
87  import org.archive.crawler.framework.CrawlController;
88  import org.archive.crawler.framework.FrontierMarker;
89  import org.archive.crawler.framework.StatisticsTracking;
90  import org.archive.crawler.framework.exceptions.InitializationException;
91  import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
92  import org.archive.crawler.frontier.AbstractFrontier;
93  import org.archive.crawler.settings.ComplexType;
94  import org.archive.crawler.settings.ModuleAttributeInfo;
95  import org.archive.crawler.settings.TextField;
96  import org.archive.crawler.settings.XMLSettingsHandler;
97  import org.archive.crawler.util.CheckpointUtils;
98  import org.archive.crawler.util.IoUtils;
99  import org.archive.util.ArchiveUtils;
100 import org.archive.util.FileUtils;
101 import org.archive.util.JEMBeanHelper;
102 import org.archive.util.JmxUtils;
103 import org.archive.util.iterator.LineReadingIterator;
104 import org.archive.util.iterator.RegexpLineIterator;
105 
106 import com.sleepycat.je.DatabaseException;
107 import com.sleepycat.je.Environment;
108 
109 /***
110  * A CrawlJob encapsulates a 'crawl order' with any and all information and
111  * methods needed by a CrawlJobHandler to accept and execute them.
112  *
113  * <p>A given crawl job may also be a 'profile' for a crawl. In that case it
114  * should not be executed as a crawl but can be edited and used as a template
115  * for creating new CrawlJobs.
116  *
 * <p>All of its constructors are protected since only a CrawlJobHandler
 * should construct new CrawlJobs.
119  *
120  * @author Kristinn Sigurdsson
121  *
122  * @see org.archive.crawler.admin.CrawlJobHandler#newJob(CrawlJob, String,
123  * String, String, String, int)
124  * @see org.archive.crawler.admin.CrawlJobHandler#newProfile(CrawlJob,
125  *  String, String, String)
126  */
127 
128 public class CrawlJob extends NotificationBroadcasterSupport
129 implements DynamicMBean, MBeanRegistration, CrawlStatusListener, Serializable {
130     /***
131      * Eclipse generated serial number.
132      */
133     private static final long serialVersionUID = 3411161000452525856L;
134     
135     private static final Logger logger =
136         Logger.getLogger(CrawlJob.class.getName());
137     /*
138      * Possible values for Priority
139      */
140     /*** lowest */
141     public static final int PRIORITY_MINIMAL = 0;
142     /*** low */
143     public static final int PRIORITY_LOW = 1;
144     /*** average */
145     public static final int PRIORITY_AVERAGE = 2;
146     /*** high */
147     public static final int PRIORITY_HIGH = 3;
148     /*** highest */
149     public static final int PRIORITY_CRITICAL = 4;
150 
151     /*
152      * Possible states for a Job.
153      */
154     /*** Inital value. May not be ready to run/incomplete. */
155     public static final String STATUS_CREATED = "Created";
156     /*** Job has been successfully submitted to a CrawlJobHandler */
157     public static final String STATUS_PENDING = "Pending";
158     /*** Job is being crawled */
159     public static final String STATUS_RUNNING = "Running";
160     /*** Job was deleted by user, will not be displayed in UI. */
161     public static final String STATUS_DELETED = "Deleted";
162     /*** Job was terminted by user input while crawling */
163     public static final String STATUS_ABORTED = "Finished - Ended by operator";
164     /*** Something went very wrong */
165     public static final String STATUS_FINISHED_ABNORMAL =
166         "Finished - Abnormal exit from crawling";
167     /*** Job finished normally having completed its crawl. */
168     public static final String STATUS_FINISHED = "Finished";
169     /*** Job finished normally when the specified timelimit was hit. */
170     public static final String STATUS_FINISHED_TIME_LIMIT =
171         "Finished - Timelimit hit";
172     /*** Job finished normally when the specifed amount of 
173      * data (MB) had been downloaded */
174     public static final String STATUS_FINISHED_DATA_LIMIT =
175         "Finished - Maximum amount of data limit hit";
176     /*** Job finished normally when the specified number of documents had been
177      * fetched.
178      */
179     public static final String STATUS_FINISHED_DOCUMENT_LIMIT =
180         "Finished - Maximum number of documents limit hit";
181     /*** Job is going to be temporarly stopped after active threads are finished. */
182     public static final String STATUS_WAITING_FOR_PAUSE = "Pausing - " +
183         "Waiting for threads to finish";
184     /*** Job was temporarly stopped. State is kept so it can be resumed */
185     public static final String STATUS_PAUSED = "Paused";
186     /***
187      * Job is being checkpointed.  When finished checkpointing, job is set
188      * back to STATUS_PAUSED (Job must be first paused before checkpointing
189      * will run).
190      */
191     public static final String STATUS_CHECKPOINTING = "Checkpointing";
192     /*** Job could not be launced due to an InitializationException */
193     public static final String STATUS_MISCONFIGURED = "Could not launch job " +
194         "- Fatal InitializationException";
195     /*** Job is actually a profile */
196     public static final String STATUS_PROFILE = "Profile";
197     
198     public static final String STATUS_PREPARING = "Preparing";
199 
200     // Class variables
201     private String UID;       //A UID issued by the CrawlJobHandler.
202     private String name;
203     private String status;
204     private boolean isReadOnly = false;
205     private boolean isNew = true;
206     private boolean isProfile = false;
207     private boolean isRunning = false;
208     private int priority;
209     private int numberOfJournalEntries = 0;
210     
211     private String statisticsFileSave = "";
212 
213     private String errorMessage = null;
214 
215     private File jobDir = null;
216 
217     private transient CrawlJobErrorHandler errorHandler = null;
218 
219     protected transient XMLSettingsHandler settingsHandler;
220     
221     private transient CrawlController controller = null;
222     
223     private static final String RECOVERY_JOURNAL_STYLE = "recoveryJournal";
224     private static final String CRAWL_LOG_STYLE = "crawlLog";
225     
226     // OpenMBean support.
227 
228     /***
229      * Server we registered with. Maybe null.
230      */
231     private transient MBeanServer mbeanServer = null;
232     private transient ObjectName mbeanName = null;
233     private static final String CRAWLJOB_JMXMBEAN_TYPE =
234         JmxUtils.SERVICE + ".Job";
235     private transient JEMBeanHelper bdbjeMBeanHelper = null;
236     private transient List<String> bdbjeAttributeNameList = null;
237     private transient List<String> bdbjeOperationsNameList = null;
238     
239     
240     /***
241      * The MBean we've registered ourselves with (May be null
242      * throughout life of Heritrix).
243      */
244     private transient OpenMBeanInfoSupport openMBeanInfo;
245     
246     private final static String NAME_ATTR = "Name";
247     private final static String UID_ATTR = "UID";
248     private final static String STATUS_ATTR = "Status";
249     private final static String FRONTIER_SHORT_REPORT_ATTR =
250         "FrontierShortReport";
251     private final static String THREADS_SHORT_REPORT_ATTR =
252         "ThreadsShortReport";
253     private final static String TOTAL_DATA_ATTR = "TotalData";
254     private final static String CRAWL_TIME_ATTR = "CrawlTime";
255     private final static String DOC_RATE_ATTR = "DocRate";
256     private final static String CURRENT_DOC_RATE_ATTR = "CurrentDocRate";
257     private final static String KB_RATE_ATTR = "KbRate";
258     private final static String CURRENT_KB_RATE_ATTR = "CurrentKbRate";
259     private final static String THREAD_COUNT_ATTR = "ThreadCount";
260     private final static String DOWNLOAD_COUNT_ATTR = "DownloadedCount";
261     private final static String DISCOVERED_COUNT_ATTR = "DiscoveredCount";
262     private final static String [] ATTRIBUTE_ARRAY = {NAME_ATTR, UID_ATTR,
263         STATUS_ATTR, FRONTIER_SHORT_REPORT_ATTR, THREADS_SHORT_REPORT_ATTR,
264         TOTAL_DATA_ATTR, CRAWL_TIME_ATTR, DOC_RATE_ATTR,
265         CURRENT_DOC_RATE_ATTR, KB_RATE_ATTR, CURRENT_KB_RATE_ATTR,
266         THREAD_COUNT_ATTR, DOWNLOAD_COUNT_ATTR, DISCOVERED_COUNT_ATTR};
267     private final static List ATTRIBUTE_LIST = Arrays.asList(ATTRIBUTE_ARRAY);
268     
269     private final static String IMPORT_URI_OPER = "importUri";
270     private final static String IMPORT_URIS_OPER = "importUris";
271     private final static String DUMP_URIS_OPER = "dumpUris";
272     private final static String PAUSE_OPER = "pause";
273     private final static String RESUME_OPER = "resume";
274     private final static String FRONTIER_REPORT_OPER = "frontierReport";
275     private final static String THREADS_REPORT_OPER = "threadsReport";
276     private final static String SEEDS_REPORT_OPER = "seedsReport";
277     private final static String CHECKPOINT_OPER = "startCheckpoint";
278     private final static String PROGRESS_STATISTICS_OPER =
279         "progressStatistics";
280     private final static String PROGRESS_STATISTICS_LEGEND_OPER =
281         "progressStatisticsLegend";
282     
283     private final static String PROG_STATS = "progressStatistics";
284     
285     // Same as JEMBeanHelper.OP_DB_STAT
286     private final static String OP_DB_STAT = "getDatabaseStats";
287     
288     /***
289      * Don't add the following crawl-order items.
290      */
291     private final static List ORDER_EXCLUDE;
292     static {
293         ORDER_EXCLUDE = Arrays.asList(new String [] {"bdb-cache-percent",
294             "extract-processors", "DNS", "uri-included-structure"});
295     }
296     
297     /***
298      * Sequence number for jmx notifications.
299      */
300     private static int notificationsSequenceNumber = 1;
301     
    /**
     * A shutdown Constructor.
     * NOTE(review): no-argument constructor — presumably required for
     * serialization/shutdown handling; confirm before removing.
     */
    protected CrawlJob() {
        super();
    }
308 
    /**
     * A constructor for jobs.
     *
     * <p> Create, ready to crawl, jobs.
     * @param UID A unique ID for this job. Typically emitted by the
     *            CrawlJobHandler.
     * @param name The name of the job
     * @param settingsHandler The associated settings
     * @param errorHandler The crawl jobs settings error handler.
     *           <tt>null</tt> means none is set
     * @param priority job priority.
     * @param dir The directory that is considered this jobs working directory.
     */
    public CrawlJob(final String UID,
            final String name, final XMLSettingsHandler settingsHandler,
            final CrawlJobErrorHandler errorHandler, final int priority,
            final File dir) {
        // Delegate to the full constructor: no explicit status, not a
        // profile, considered new.
        this(UID, name, settingsHandler, errorHandler,
                priority, dir, null, false, true);
    }
329 
    /**
     * A constructor for profiles.
     *
     * <p> Any job created with this constructor will be
     * considered a profile. Profiles are not stored on disk (only their
     * settings files are stored on disk). This is because their data is
     * predictable given any settings files.
     * @param UIDandName A unique ID for this job. For profiles this is the same
     *           as name
     * @param settingsHandler The associated settings
     * @param errorHandler The crawl jobs settings error handler.
     *           <tt>null</tt> means none is set
     */
    protected CrawlJob(final String UIDandName,
            final XMLSettingsHandler settingsHandler,
            final CrawlJobErrorHandler errorHandler) {
        // Profiles: UID == name, average priority, no job directory,
        // status STATUS_PROFILE, isProfile=true, isNew=false.
        this(UIDandName, UIDandName, settingsHandler, errorHandler,
            PRIORITY_AVERAGE, null, STATUS_PROFILE, true, false);
    }
349     
    /**
     * Full constructor setting all state explicitly.  The other
     * constructors delegate here.
     *
     * @param UID A unique ID for this job.
     * @param name The name of the job.
     * @param settingsHandler The associated settings.
     * @param errorHandler The crawl jobs settings error handler;
     *        <tt>null</tt> means none is set.
     * @param priority Job priority (one of the PRIORITY_* constants).
     * @param dir The directory considered this jobs working directory
     *        (null for profiles).
     * @param status Initial status (one of the STATUS_* constants), may
     *        be null.
     * @param isProfile Whether this job is a profile.
     * @param isNew Whether this job is considered new.
     */
    public CrawlJob(final String UID,
            final String name, final XMLSettingsHandler settingsHandler,
            final CrawlJobErrorHandler errorHandler, final int priority,
            final File dir, final String status, final boolean isProfile,
            final boolean isNew) {
        super();
        this.UID = UID;
        this.name = name;
        this.settingsHandler = settingsHandler;
        this.errorHandler = errorHandler;
        this.status = status;
        this.isProfile = isProfile;
        this.isNew = isNew;
        this.jobDir = dir;
        this.priority = priority;
    }
366 
367     /***
368      * A constructor for reloading jobs from disk. Jobs (not profiles) have
369      * their data written to persistent storage in the file system. This method
370      * is used to load the job from such storage. This is done by the
371      * <code>CrawlJobHandler</code>.
372      * <p>
373      * Proper structure of a job file (TODO: Maybe one day make this an XML file)
374      * Line 1. UID <br>
375      * Line 2. Job name (string) <br>
376      * Line 3. Job status (string) <br>
377      * Line 4. is job read only (true/false) <br>
378      * Line 5. is job running (true/false) <br>
379      * Line 6. job priority (int) <br>
380      * Line 7. number of journal entries <br>
381      * Line 8. setting file (with path) <br>
382      * Line 9. statistics tracker file (with path) <br>
383      * Line 10-?. error message (String, empty for null), can be many lines <br>
384      * @param jobFile
385      *            a file containing information about the job to load.
386      * @param errorHandler The crawl jobs settings error handler.
387      *            null means none is set
388      * @throws InvalidJobFileException
389      *            if the specified file does not refer to a valid job file.
390      * @throws IOException
391      *            if io operations fail
392      */
393     protected CrawlJob(final File jobFile,
394             final CrawlJobErrorHandler errorHandler)
395             throws InvalidJobFileException, IOException {
396         this(null, null, null, errorHandler,
397                 PRIORITY_AVERAGE, null, null, false, true);
398         this.jobDir = jobFile.getParentFile();
399         
400         // Check for corrupt job.state files (can be corrupt if we crash).
401         if (jobFile.length() == 0) {
402             throw new InvalidJobFileException(jobFile.getCanonicalPath() +
403                 " is corrupt (length is zero)");
404         }
405         
406         // Open file. Read data and set up class variables accordingly...
407         BufferedReader jobReader =
408             new BufferedReader(new FileReader(jobFile), 4096);
409         // UID
410         this.UID = jobReader.readLine();
411         // name
412         this.name = jobReader.readLine();
413         // status
414         this.status = jobReader.readLine();
415         if(status.equals(STATUS_ABORTED)==false
416                 && status.equals(STATUS_CREATED)==false
417                 && status.equals(STATUS_DELETED)==false
418                 && status.equals(STATUS_FINISHED)==false
419                 && status.equals(STATUS_FINISHED_ABNORMAL)==false
420                 && status.equals(STATUS_FINISHED_DATA_LIMIT)==false
421                 && status.equals(STATUS_FINISHED_DOCUMENT_LIMIT)==false
422                 && status.equals(STATUS_FINISHED_TIME_LIMIT)==false
423                 && status.equals(STATUS_MISCONFIGURED)==false
424                 && status.equals(STATUS_PAUSED)==false
425                 && status.equals(STATUS_CHECKPOINTING)==false
426                 && status.equals(STATUS_PENDING)==false
427                 && status.equals(STATUS_RUNNING)==false
428                 && status.equals(STATUS_WAITING_FOR_PAUSE)==false
429                 && status.equals(STATUS_PREPARING)==false){
430             // status is invalid. Must be one of the above
431             throw new InvalidJobFileException("Status (line 3) in job file " +
432                     "is not valid: '" + status + "'");
433         }
434         // isReadOnly
435         String tmp = jobReader.readLine();
436         if(tmp.equals("true")){
437             isReadOnly = true;
438         } else if(tmp.equals("false")){
439             isReadOnly = false;
440         } else {
441             throw new InvalidJobFileException("isReadOnly (line 4) in job" +
442                     " file '" + jobFile.getAbsolutePath() + "' is not " +
443                     "valid: '" + tmp + "'");
444         }
445         // isRunning
446         tmp = jobReader.readLine();
447         if(tmp.equals("true")){
448             this.isRunning = true;
449         } else if(tmp.equals("false")){
450             this.isRunning = false;
451         } else {
452             throw new InvalidJobFileException("isRunning (line 5) in job " +
453                     "file '" + jobFile.getAbsolutePath() + "' is not valid: " +
454                     "'" + tmp + "'");
455         }
456         // priority
457         tmp = jobReader.readLine();
458         try{
459             this.priority = Integer.parseInt(tmp);
460         } catch(NumberFormatException e){
461             throw new InvalidJobFileException("priority (line 5) in job " +
462                     "file '" + jobFile.getAbsolutePath() + "' is not valid: " +
463                     "'" + tmp + "'");
464         }
465         // numberOfJournalEntries
466         tmp = jobReader.readLine();
467         try{
468             this.numberOfJournalEntries = Integer.parseInt(tmp);
469         } catch(NumberFormatException e){
470             throw new InvalidJobFileException("numberOfJournalEntries " +
471                     "(line 5) in job file '" + jobFile.getAbsolutePath() +
472                     "' is not valid: " + "'" + tmp + "'");
473         }
474         // settingsHandler
475         tmp = jobReader.readLine();
476         try {
477             File f = new File(tmp);
478             this.settingsHandler = new XMLSettingsHandler((f.isAbsolute())?
479                 f: new File(jobDir, f.getName()));
480             if(this.errorHandler != null){
481                 this.settingsHandler.registerValueErrorHandler(errorHandler);
482             }
483             this.settingsHandler.initialize();
484         } catch (InvalidAttributeValueException e1) {
485             throw new InvalidJobFileException("Problem reading from settings " +
486                     "file (" + tmp + ") specified in job file '" +
487                     jobFile.getAbsolutePath() + "'\n" + e1.getMessage());
488         }
489         // Statistics tracker.
490         jobReader.readLine();
491         // errorMessage
492         // TODO: Multilines
493         tmp = jobReader.readLine();
494         errorMessage = "";
495         while(tmp!=null){
496             errorMessage+=tmp+'\n';
497             tmp = jobReader.readLine();
498         }
499         if(errorMessage.length()==0){
500             // Empty error message should be null
501             errorMessage = null;
502         }
503         // TODO: Load stattrack if needed.
504 
505         // TODO: This should be inside a finally block.
506         jobReader.close();
507     }
508 
509     /***
510      * Cause the job to be written to persistent storage.
511      * This will also save the statistics tracker if it is not null and the
512      * job status is finished (regardless of how it's finished)
513      */
514     private void writeJobFile() {
515         if (isProfile) {
516             return;
517         }
518         
519         final String jobDirAbsolute = jobDir.getAbsolutePath();
520         if (!jobDir.exists() || !jobDir.canWrite()) {
521             logger.warning("Can't update status on " +
522                 jobDirAbsolute + " because file does not" +
523                 " exist (or is unwriteable)");
524             return;
525         }
526         File f = new File(jobDirAbsolute, "state.job");
527 
528         String settingsFile = getSettingsDirectory();
529         // Make settingsFile's path relative if order.xml is somewhere in the
530         // job's directory tree
531         if(settingsFile.startsWith(jobDirAbsolute.concat(File.separator))) {
532             settingsFile = settingsFile.substring(jobDirAbsolute.length()+1);
533         }
534         try {
535             FileWriter jobWriter = new FileWriter(f, false);
536             try {
537                 jobWriter.write(UID + "\n");
538                 jobWriter.write(name + "\n");
539                 jobWriter.write(status + "\n");
540                 jobWriter.write(isReadOnly + "\n");
541                 jobWriter.write(isRunning + "\n");
542                 jobWriter.write(priority + "\n");
543                 jobWriter.write(numberOfJournalEntries + "\n");
544                 jobWriter.write(settingsFile + "\n");
545                 jobWriter.write(statisticsFileSave + "\n");// TODO: Is this
546                                                             // right?
547                 // Can be multiple lines so we keep it last
548                 if (errorMessage != null) {
549                     jobWriter.write(errorMessage + "\n");
550                 }
551             } finally {
552                 if (jobWriter != null) {
553                     jobWriter.close();
554                 }
555             }
556         } catch (IOException e) {
557             logger.log(Level.WARNING, "An IOException occured saving job " +
558                     name + " (" + UID + ")", e);
559         }
560     }
561   
    /**
     * Returns this jobs unique ID (UID) that was issued by the
     * CrawlJobHandler() when this job was first created.
     *
     * @return This jobs UID.
     * @see CrawlJobHandler#getNextJobUID()
     */
    public String getUID(){
        return UID;
    }

    /**
     * Returns this job's 'name'. The name comes from the settings for this
     * job, need not be unique and may change. For a unique identifier use
     * {@link #getUID() getUID()}.
     * <p>
     * The name corresponds to the value of the 'name' tag in the 'meta'
     * section of the settings file.
     *
     * @return This job's 'name'
     */
    public String getJobName(){
        return name;
    }

    /**
     * Return the combination of given name and UID most commonly
     * used in administrative interface.
     *
     * @return Job's name with UID notation
     */
    public String getDisplayName() {
        return getJobName()+" ["+getUID()+"]";
    }
596 
    /**
     * Set this job's level of priority.
     *
     * @param priority The level of priority
     *
     * @see #getJobPriority()
     * @see #PRIORITY_MINIMAL
     * @see #PRIORITY_LOW
     * @see #PRIORITY_AVERAGE
     * @see #PRIORITY_HIGH
     * @see #PRIORITY_CRITICAL
     */
    public void setJobPriority(int priority) {
        // Note: unlike other setters, this change is not persisted via
        // writeJobFile() — TODO confirm whether that is intentional.
        this.priority = priority;
    }

    /**
     * Get this job's level of priority.
     *
     * @return this job's priority
     * @see #setJobPriority(int)
     * @see #PRIORITY_MINIMAL
     * @see #PRIORITY_LOW
     * @see #PRIORITY_AVERAGE
     * @see #PRIORITY_HIGH
     * @see #PRIORITY_CRITICAL
     */
    public int getJobPriority() {
        return priority;
    }
627 
    /**
     * Once called no changes can be made to the settings for this job.
     * Typically this is done once a crawl is completed and further changes
     * to the crawl order are therefore meaningless.  Irreversible.
     */
    public void setReadOnly() {
        isReadOnly = true;
        writeJobFile(); //Save changes
    }

    /**
     * Is job read only?
     * @return false until setReadOnly has been invoked, after that it
     * returns true.
     */
    public boolean isReadOnly(){
        return isReadOnly;
    }

    /**
     * Set the status of this CrawlJob.
     *
     * @param status Current status of CrawlJob
     *         (see constants defined here beginning with STATUS)
     */
    public void setStatus(String status) {
        this.status = status;
        writeJobFile(); //Save changes
        // TODO: If job finished, save StatisticsTracker!
    }
657 
658     /***
659      * @return Status of the crawler (Used by JMX).
660      */
661     public String getCrawlStatus() {
662         return this.controller != null?
663             this.controller.getState().toString(): "Illegal State";
664     }
665     
666     /***
667      * Get the current status of this CrawlJob
668      *
669      * @return The current status of this CrawlJob
670      *         (see constants defined here beginning with STATUS)
671      */
672     public String getStatus() {
673         return this.status;
674     }
675 
    /**
     * Returns the settings handler for this job. It will have been
     * initialized.
     * @return the settings handler for this job.
     */
    public XMLSettingsHandler getSettingsHandler() {
        return this.settingsHandler;
    }

    /**
     * Is this a new job?
     * @return True if is new.
     */
    public boolean isNew() {
        return isNew;
    }

    /**
     * Is the job considered to be a profile?
     * @return True if is a profile.
     */
    public boolean isProfile() {
        return isProfile;
    }
698 
    /**
     * Set if the job is considered a new job or not.
     * @param b Is the job considered to be new.
     */
    public void setNew(boolean b) {
        isNew = b;
        writeJobFile(); //Save changes
    }

    /**
     * Returns true if the job is being crawled.
     * @return true if the job is being crawled
     */
    public boolean isRunning() {
        return isRunning;
    }

    /**
     * Set if job is being crawled.
     * @param b Is job being crawled.
     */
    protected void setRunning(boolean b) {
        isRunning = b;
        writeJobFile(); // Save changes
        //TODO: Job ending -> Save statistics tracker.
        //TODO: This is likely to happen as the CrawlEnding event occurs,
        // need to ensure that the StatisticsTracker is saved to disk on
        // CrawlEnded. Maybe move responsibility for this into the
        // StatisticsTracker?
    }
729     
    /**
     * Unregister current job from JMX agent, if there one.
     * Failures are logged, not rethrown.
     */
    protected void unregisterMBean() {
        if (this.mbeanServer == null) {
            return;
        }
        try {
            this.mbeanServer.unregisterMBean(this.mbeanName);
            // NOTE(review): mbeanName is left non-null after unregistering;
            // confirm whether it should be cleared too.
            this.mbeanServer = null;
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Failed with " + this.mbeanName, e);
        }
    }
742     
    /***
     * Subclass of CrawlController that unregisters beans when stopped.
     * Done as a subclass so CrawlController doesn't get any JMX (or 'CrawlJob')
     * pollution, so for sure CrawlJob is unregistered with JMX and so any
     * listeners on the CrawlJob get a chance to get the crawl-ended message
     * (These latter notifications may not actually be getting through -- TBD).
     * <p>TODO: This override dirties the data model since CC knows about CJs.
     * The facility provided by this class -- emitting events and statistics so
     * they can be read by JMX -- needs to go back into CC.  Probably best to
     * register the CC in JMX, rather than the CJ.  Lets do this in Heritrix 2.0
     * since it means changing the JMX API some.
     */
755     public class MBeanCrawlController extends CrawlController
756     implements Serializable {
757         private static final long serialVersionUID = -4608537998168407222L;
758         private CrawlJob cj = null;
759         private CompositeType ct =  null;
760         
761         public CrawlJob getCrawlJob() {
762             return this.cj;
763         }
764 
765         public void setCrawlJob(CrawlJob cj) {
766             this.cj = cj;
767         }
768         
769         public void progressStatisticsEvent(final EventObject e) {
770             super.progressStatisticsEvent(e);
771             if (this.cj.getMbeanName() == null) {
772                 // Can be null around job startup.  Return w/o doing anything.
773                 return;
774             }
775                 
776             Map s = ((StatisticsTracking)e.getSource()).getProgressStatistics();
777             // Convert the statistics to OpenType CompositeData and add as
778             // user data to Notification.
779             CompositeData cd = null;
780             try {
781                 if (this.ct == null) {
782                     this.ct = JmxUtils.createCompositeType(s, PROG_STATS,
783                         PROG_STATS + " for " + this.cj.getMbeanName());
784                 }
785                 cd = new CompositeDataSupport(this.ct, s);
786             } catch (OpenDataException ode) {
787                 ode.printStackTrace();
788             }
789             if (cd != null) {
790                 Notification n = new Notification(PROG_STATS,
791                     this.cj.getMbeanName(), getNotificationsSequenceNumber(),
792                     ((StatisticsTracking)e.getSource()).
793                         getProgressStatisticsLine());
794                 n.setUserData(cd);
795                 this.cj.sendNotification(n);
796             }
797         }
798         
799         protected void completeStop() {
800             try {
801                 super.completeStop();
802             } finally {
803                 if (this.cj != null) {
804                     this.cj.unregisterMBean();
805                 }
806                 this.cj = null;
807             }
808         }
809     }
810     
811     protected CrawlController setupCrawlController()
812     throws InitializationException {
813         CrawlController controller = null;
814         
815         // Check if we're to do a checkpoint recover.  If so, deserialize
816         // the checkpoint's CrawlController and use that in place of a new
817         // CrawlController instance.
818         Checkpoint cp = CrawlController.
819             getCheckpointRecover(getSettingsHandler().getOrder());
820         if (cp != null) {
821             try {
822             	controller = (MBeanCrawlController)CheckpointUtils.
823                     readObjectFromFile(MBeanCrawlController.class,
824                         cp.getDirectory());
825             } catch (FileNotFoundException e) {
826                 throw new InitializationException(e);
827             } catch (IOException e) {
828                 throw new InitializationException(e);
829             } catch (ClassNotFoundException e) {
830                 throw new InitializationException(e);
831             }
832         } else {
833         	controller = new MBeanCrawlController();
834         }
835         return controller;
836     }
837     
    /** @return A new MBeanCrawlController so crawls report via JMX. */
    protected CrawlController createCrawlController() {
        return new MBeanCrawlController();
    }
841     
842     public void setupForCrawlStart()
843     throws InitializationException {
844         try {
845         	this.controller = setupCrawlController();
846             // Register as listener to get job finished notice.
847             this.controller.addCrawlStatusListener(this);
848             this.controller.initialize(getSettingsHandler());
849             // Set the crawl job this MBeanCrawlController needs to worry about.
850             ((MBeanCrawlController)this.controller).setCrawlJob(this);
851             // Create our mbean description and register our crawljob.
852             this.openMBeanInfo = buildMBeanInfo();
853             try {
854                 Heritrix.registerMBean(this, getJmxJobName(),
855                     CRAWLJOB_JMXMBEAN_TYPE);
856             } catch (InstanceAlreadyExistsException e) {
857                 throw new InitializationException(e);
858             } catch (MBeanRegistrationException e) {
859                 throw new InitializationException(e);
860             } catch (NotCompliantMBeanException e) {
861                 throw new InitializationException(e);
862             }
863         } catch (InitializationException e) {
864             // Can't load current job since it is misconfigured.
865             setStatus(CrawlJob.STATUS_MISCONFIGURED);
866             setErrorMessage("A fatal InitializationException occured when "
867                     + "loading job:\n" + e.getMessage());
868             // Log to stdout so its seen in logs as well as in UI.
869             e.printStackTrace();
870             this.controller = null;
871             throw e;
872         }
873         setStatus(CrawlJob.STATUS_RUNNING);
874         setRunning(true);
875     }
876     
877     public void stopCrawling() {
878         if(this.controller != null) {
879             this.controller.requestCrawlStop();
880         }
881     }
882 
883     /***
884      * @return One-line Frontier report.
885      */
886     public String getFrontierOneLine() {
887         if (this.controller == null || this.controller.getFrontier() == null) {
888             return "Crawler not running";
889         }
890         return this.controller.getFrontier().singleLineReport();
891     }
892     
893     /***
894      * @param reportName Name of report to write.
895      * @return A report of the frontier's status.
896      */
897     public String getFrontierReport(final String reportName) {
898         if (this.controller == null || this.controller.getFrontier() == null) {
899             return "Crawler not running";
900         }
901         return ArchiveUtils.writeReportToString(this.controller.getFrontier(),
902                 reportName);
903     }
904     
905     /***
906      * Write the requested frontier report to the given PrintWriter
907      * @param reportName Name of report to write.
908      * @param writer Where to write to.
909      */
910     public void writeFrontierReport(String reportName, PrintWriter writer) {
911         if (this.controller == null || this.controller.getFrontier() == null) {
912             writer.println("Crawler not running.");
913             return;
914         }
915         this.controller.getFrontier().reportTo(reportName,writer);
916     }
917 
918     /***
919      * @return One-line threads report.
920      */
921     public String getThreadOneLine() {
922         if (this.controller == null) {
923             return "Crawler not running";
924         }
925         return this.controller.oneLineReportThreads();
926     }
927     
928     /***
929      * Get the CrawlControllers ToeThreads report for the running crawl.
930      * @return The CrawlControllers ToeThreads report
931      */
932     public String getThreadsReport() {
933         if (this.controller == null) {
934             return "Crawler not running";
935         }
936         return ArchiveUtils.writeReportToString(this.controller.getToePool(),
937                 null);
938     }
939     
940     /***
941      * Write the requested threads report to the given PrintWriter
942      * @param reportName Name of report to write.
943      * @param writer Where to write to.
944      */
945     public void writeThreadsReport(String reportName, PrintWriter writer) {
946         if (this.controller == null || this.controller.getFrontier() == null) {
947             writer.println("Crawler not running.");
948             return;
949         }
950         this.controller.getToePool().reportTo(reportName, writer);
951     }
952     
953     /***
954      * Kills a thread. For details see
955      * {@link org.archive.crawler.framework.ToePool#killThread(int, boolean)
956      * ToePool.killThread(int, boolean)}.
957      * @param threadNumber Thread to kill.
958      * @param replace Should thread be replaced.
959      * @see org.archive.crawler.framework.ToePool#killThread(int, boolean)
960      */
961     public void killThread(int threadNumber, boolean replace) {
962         if (this.controller ==  null) {
963             return;
964         }
965         this.controller.killThread(threadNumber, replace);
966     }
967 
968     /***
969      * Get the Processors report for the running crawl.
970      * @return The Processors report for the running crawl.
971      */
972     public String getProcessorsReport() {
973         if (this.controller == null) {
974             return "Crawler not running";
975         }
976         return ArchiveUtils.writeReportToString(this.controller,
977                 CrawlController.PROCESSORS_REPORT);
978     }
979     
980     /***
981      * Returns the directory where the configuration files for this job are
982      * located.
983      *
984      * @return the directory where the configuration files for this job are
985      *         located
986      */
    public String getSettingsDirectory() {
        // NOTE(review): despite the name, this returns the path of the order
        // *file* itself, not its parent directory -- callers appear to rely
        // on this (see getDirectory()); confirm before changing.
        return settingsHandler.getOrderFile().getPath();
    }
990 
991     /***
992      * Returns the path of the job's base directory. For profiles this is always
993      * equal to <code>new File(getSettingsDirectory())</code>.
994      * @return the path of the job's base directory.
995      */
996     public File getDirectory(){
997         return isProfile? new File(getSettingsDirectory()): jobDir;
998     }
999 
1000     /***
1001      * Get the error message associated with this job. Will return null if there
1002      * is no error message.
1003      * @return the error message associated with this job
1004      */
1005     public String getErrorMessage() {
1006         return errorMessage;
1007     }
1008 
1009     /***
1010      * Set an error message for this job. Generally this only occurs if the job
1011      * is misconfigured.
1012      * @param string the error message associated with this job
1013      */
1014     public void setErrorMessage(String string) {
1015         errorMessage = string;
1016         writeJobFile(); //Save changes
1017     }
1018 
1019     /***
1020      * @return Returns the number of journal entries.
1021      */
1022     public int getNumberOfJournalEntries() {
1023         return numberOfJournalEntries;
1024     }
1025 
1026     /***
1027      * @param numberOfJournalEntries The number of journal entries to set.
1028      */
    public void setNumberOfJournalEntries(int numberOfJournalEntries) {
        this.numberOfJournalEntries = numberOfJournalEntries;
        // Persist the updated count to the job file.
        writeJobFile();
    }
1033 
1034     /***
1035      * @return Returns the error handler for this crawl job
1036      */
1037     public CrawlJobErrorHandler getErrorHandler() {
1038         return errorHandler;
1039     }
1040 
1041     /***
1042      * Read all the checkpoints found in the job's checkpoints
1043      * directory into Checkpoint instances
1044      * @return Collection containing list of all checkpoints.
1045      */
1046     public Collection scanCheckpoints() {
1047         File checkpointsDirectory =
1048             settingsHandler.getOrder().getCheckpointsDirectory();
1049         File[] perCheckpointDirs = checkpointsDirectory.listFiles();
1050         Collection<Checkpoint> checkpoints = new ArrayList<Checkpoint>();
1051         if (perCheckpointDirs != null) {
1052             for (int i = 0; i < perCheckpointDirs.length; i++) {
1053                 Checkpoint cp = new Checkpoint(perCheckpointDirs[i]);
1054                 checkpoints.add(cp);
1055             }
1056         }
1057         return checkpoints;
1058     }
1059 
1060     /***
1061      * Returns the absolute path of the specified log.
1062      * Note: If crawl has not begun, this file may not exist.
1063      * @param log
1064      * @return the absolute path for the specified log.
1065      * @throws AttributeNotFoundException
1066      * @throws ReflectionException
1067      * @throws MBeanException
1068      */
1069     public String getLogPath(String log) 
1070     throws AttributeNotFoundException, MBeanException, ReflectionException {
1071         String logsPath = (String)settingsHandler.getOrder().
1072             getAttribute(CrawlOrder.ATTR_LOGS_PATH);
1073         CrawlOrder order = settingsHandler.getOrder();
1074         String diskPath = (String) order.getAttribute(null,
1075             CrawlOrder.ATTR_DISK_PATH);
1076         File disk = settingsHandler.
1077             getPathRelativeToWorkingDirectory(diskPath);
1078         File f = new File(logsPath, log);
1079         if (!f.isAbsolute()) {
1080             f = new File(disk.getPath(), f.getPath());
1081         }
1082         return f.getAbsolutePath();
1083     }
1084 
1085     // OpenMBean implementation.
1086     
1087     protected void pause() {
1088         if (this.controller != null && this.controller.isPaused() == false) {
1089             this.controller.requestCrawlPause();
1090         }
1091     }
1092     
1093     protected void resume() {
1094         if (this.controller != null) {
1095             this.controller.requestCrawlResume();
1096         }
1097     }
1098 
1099     /***
1100      * @throws IllegalStateException Thrown if crawl is not paused.
1101      */
1102     protected void checkpoint() throws IllegalStateException {
1103         if (this.controller != null) {
1104             this.controller.requestCrawlCheckpoint();
1105         }
1106     }
1107     
1108     /***
1109      * @return True if checkpointing.
1110      */
1111     public boolean isCheckpointing() {
1112         return this.controller != null? this.controller.isCheckpointing(): false;
1113     }
1114     
    /***
     * If it's a HostQueuesFrontier, it needs to be flushed for the queued
     * URIs; the default implementation is a noop.
     */
    protected void flush() {
        // Nothing to do here; this is an override hook for frontiers that
        // buffer queued URIs and need an explicit flush.
    }
1121 
1122     /***
1123      * Delete any URI from the frontier of the current (paused) job that match
1124      * the specified regular expression. If the current job is not paused (or
1125      * there is no current job) nothing will be done.
1126      * @param regexpr Regular expression to delete URIs by.
1127      * @return the number of URIs deleted
1128      */
1129     public long deleteURIsFromPending(String regexpr){
1130         return deleteURIsFromPending(regexpr,null);
1131     }
1132     
1133     /***
1134      * Delete any URI from the frontier of the current (paused) job that match
1135      * the specified regular expression. If the current job is not paused (or
1136      * there is no current job) nothing will be done.
     * @param uriPattern Regular expression to delete URIs by.
     * @param queuePattern Regular expression queues must match; null means
     * all queues.
     * @return the number of URIs deleted
1139      */
1140     public long deleteURIsFromPending(String uriPattern, String queuePattern){
1141         return (this.controller != null &&
1142                 this.controller.getFrontier() != null &&
1143                 this.controller.isPaused())?
1144             this.controller.getFrontier().deleteURIs(uriPattern,queuePattern): 0;
1145     }
1146     
1147     public String importUris(String file, String style, String force) {
1148         return importUris(file, style, "true".equals(force));
1149     }
1150     
    public String importUris(final String fileOrUrl, final String style,
            final boolean forceRevisit) {
        // Delegate: content is not treated as exclusively seeds.
        return importUris(fileOrUrl, style, forceRevisit, false);
    }
1155 
1156     /***
1157      * @param fileOrUrl Name of file w/ seeds.
1158      * @param style What style of seeds -- crawl log, recovery journal, or
1159      * seeds file.
1160      * @param forceRevisit Should we revisit even if seen before?
1161      * @param areSeeds Is the file exclusively seeds?
1162      * @return A display string that has a count of all added.
1163      */
1164     public String importUris(final String fileOrUrl, final String style,
1165             final boolean forceRevisit, final boolean areSeeds) {
1166         InputStream is =
1167             IoUtils.getInputStream(this.controller.getDisk(), fileOrUrl);
1168         String message = null;
1169         // Do we have an inputstream?
1170         if (is == null) {
1171             message = "Failed to get inputstream from " + fileOrUrl;
1172             logger.severe(message);
1173         } else {
1174             int addedCount = importUris(is, style, forceRevisit, areSeeds);
1175             message = Integer.toString(addedCount) + " URIs added from " +
1176                 fileOrUrl;
1177         }
1178         return message;
1179     }
1180     
    protected int importUris(InputStream is, String style,
            boolean forceRevisit) {
        // Delegate: stream content is not treated as exclusively seeds.
        return importUris(is, style, forceRevisit, false);
    }
1185     
1186     /***
1187      * Import URIs.
1188      * @param is Stream to use as URI source.
1189      * @param style Style in which URIs are rendored.  Currently support for
1190      * <code>recoveryJournal</code>, <code>crawlLog</code>, and seeds file
1191      * format (i.e <code>default</code>) where <code>default</code> style is
1192      * a UURI per line (comments allowed).
1193      * @param forceRevisit Whether we should revisit this URI even if we've
1194      * visited it previously.
1195      * @param areSeeds Are the imported URIs seeds?
1196      * @return Count of added URIs.
1197      */
1198     protected int importUris(InputStream is, String style,
1199             boolean forceRevisit, final boolean areSeeds) {
1200         // Figure the regex to use parsing each line of input stream.
1201         String extractor;
1202         String output;
1203         if(CRAWL_LOG_STYLE.equals(style)) {
1204             // Skip first 3 fields
1205             extractor = "//S+//s+//S+//s+//S+//s+(//S+//s+//S+//s+//S+//s+).*";
1206             output = "$1";
1207         } else if (RECOVERY_JOURNAL_STYLE.equals(style)) {
1208             // Skip the begin-of-line directive
1209             extractor = "//S+//s+((//S+)(?://s+//S+//s+//S+)?)//s*";
1210             output = "$1";
1211         } else {
1212             extractor =
1213                 RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT;
1214             output = RegexpLineIterator.ENTRY;
1215         }
1216         
1217         controller.installThreadContextSettingsHandler();
1218         
1219         // Read the input stream.
1220         BufferedReader br = null;
1221         int addedCount = 0;
1222         try {
1223             br = new BufferedReader(new InputStreamReader(is));
1224             Iterator iter = new RegexpLineIterator(new LineReadingIterator(br),
1225                 RegexpLineIterator.COMMENT_LINE, extractor, output);
1226             while(iter.hasNext()) {
1227                 try {
1228                     importUri((String)iter.next(), forceRevisit, areSeeds,
1229                         false);
1230                     addedCount++;
1231                 } catch (URIException e) {
1232                     e.printStackTrace();
1233                 }
1234             }
1235             br.close();
1236             flush();
1237         } catch (IOException e) {
1238             e.printStackTrace();
1239         }
1240         return addedCount;
1241     }
1242     
1243     /***
1244      * Schedule a uri.
1245      * @param uri Uri to schedule.
1246      * @param forceFetch Should it be forcefetched.
1247      * @param isSeed True if seed.
1248      * @throws URIException
1249      */
    public void importUri(final String uri, final boolean forceFetch,
            final boolean isSeed)
    throws URIException {
        // Delegate, flushing the frontier if it supports flushing.
        importUri(uri, forceFetch, isSeed, true);
    }
1255     
1256     /***
1257      * Schedule a uri.
1258      * @param str String that can be: 1. a UURI, 2. a snippet of the
1259      * crawl.log line, or 3. a snippet from recover log.  See
1260      * {@link #importUris(InputStream, String, boolean)} for how it subparses
1261      * the lines from crawl.log and recover.log.
1262      * @param forceFetch Should it be forcefetched.
1263      * @param isSeed True if seed.
1264      * @param isFlush If true, flush the frontier IF it implements
1265      * flushing.
1266      * @throws URIException
1267      */
1268     public void importUri(final String str, final boolean forceFetch,
1269             final boolean isSeed, final boolean isFlush)
1270     throws URIException {
1271         CandidateURI caUri = CandidateURI.fromString(str);
1272         caUri.setForceFetch(forceFetch);
1273         if (isSeed) {
1274             caUri.setIsSeed(isSeed);
1275             if (caUri.getVia() == null || caUri.getVia().length() <= 0) {
1276                 // Danger of double-add of seeds because of this code here.
1277                 // Only call addSeed if no via.  If a via, the schedule will
1278                 // take care of updating scope.
1279                 this.controller.getScope().addSeed(caUri);
1280             }
1281         }
1282         this.controller.getFrontier().schedule(caUri);
1283         if (isFlush) {
1284             flush();
1285         }
1286     }
1287     
1288     
1289     /***
1290      * @return Our mbean info (Needed for CrawlJob to qualify as a
1291      * DynamicMBean).
1292      */
    public MBeanInfo getMBeanInfo() {
        // Built once by buildMBeanInfo() during setupForCrawlStart().
        return this.openMBeanInfo;
    }
1296     
1297     /***
1298      * Build up the MBean info for Heritrix main.
1299      * @return Return created mbean info instance.
1300      * @throws InitializationException 
1301      */
1302     protected OpenMBeanInfoSupport buildMBeanInfo()
1303     throws InitializationException {
1304         // Start adding my attributes.
1305         List<OpenMBeanAttributeInfo> attributes
1306          = new ArrayList<OpenMBeanAttributeInfo>();
1307 
1308         // Attributes.
1309         attributes.add(new OpenMBeanAttributeInfoSupport(NAME_ATTR,
1310             "Crawl job name", SimpleType.STRING, true, false, false));
1311         attributes.add(new OpenMBeanAttributeInfoSupport(STATUS_ATTR,
1312             "Short basic status message", SimpleType.STRING, true, false,
1313             false));
1314         attributes.add(
1315                 new OpenMBeanAttributeInfoSupport(FRONTIER_SHORT_REPORT_ATTR,
1316                 "Short frontier report", SimpleType.STRING, true,
1317                 false, false));
1318         attributes.add(
1319                 new OpenMBeanAttributeInfoSupport(THREADS_SHORT_REPORT_ATTR,
1320                 "Short threads report", SimpleType.STRING, true,
1321                 false, false));
1322         attributes.add(new OpenMBeanAttributeInfoSupport(UID_ATTR,
1323             "Crawl job UID", SimpleType.STRING, true, false, false));  
1324         attributes.add(new OpenMBeanAttributeInfoSupport(TOTAL_DATA_ATTR,
1325             "Total data received", SimpleType.LONG, true, false, false));
1326         attributes.add(new OpenMBeanAttributeInfoSupport(CRAWL_TIME_ATTR,
1327             "Crawl time", SimpleType.LONG, true, false, false));
1328         attributes.add(new OpenMBeanAttributeInfoSupport(CURRENT_DOC_RATE_ATTR,
1329             "Current crawling rate (Docs/sec)", SimpleType.DOUBLE,
1330             true, false, false));
1331         attributes.add(new OpenMBeanAttributeInfoSupport(CURRENT_KB_RATE_ATTR,
1332             "Current crawling rate (Kb/sec)", SimpleType.LONG,
1333             true, false, false));
1334         attributes.add(new OpenMBeanAttributeInfoSupport(THREAD_COUNT_ATTR,
1335             "Active thread count", SimpleType.INTEGER, true, false, false));
1336         attributes.add(new OpenMBeanAttributeInfoSupport(DOC_RATE_ATTR,
1337             "Crawling rate (Docs/sec)", SimpleType.DOUBLE,
1338             true, false, false));
1339         attributes.add(new OpenMBeanAttributeInfoSupport(KB_RATE_ATTR,
1340             "Current crawling rate (Kb/sec)", SimpleType.LONG,
1341             true, false, false));
1342         attributes.add(new OpenMBeanAttributeInfoSupport(DOWNLOAD_COUNT_ATTR,
1343             "Count of downloaded documents", SimpleType.LONG,
1344             true, false, false));
1345         attributes.add(new OpenMBeanAttributeInfoSupport(DISCOVERED_COUNT_ATTR,
1346             "Count of discovered documents", SimpleType.LONG,
1347             true, false, false));
1348         
1349         // Add in the crawl order attributes.
1350         addCrawlOrderAttributes(this.getController().getOrder(), attributes);
1351         
1352         // Add the bdbje attributes.  Convert to open mbean attributes.
1353         // First do bdbeje setup.  Then add a subset of the bdbje attributes.
1354         // Keep around the list of names as a convenience for when it comes
1355         // time to test if attribute is supported.
1356         Environment env = this.controller.getBdbEnvironment();
1357         try {
1358             this.bdbjeMBeanHelper =
1359                 new JEMBeanHelper(env.getConfig(), env.getHome(), true);
1360         } catch (DatabaseException e) {
1361             e.printStackTrace();
1362             InitializationException ie =
1363                 new InitializationException(e.getMessage());
1364             ie.setStackTrace(e.getStackTrace());
1365             throw ie;
1366         }
1367         this.bdbjeAttributeNameList = Arrays.asList(new String [] {
1368                 JEMBeanHelper.ATT_ENV_HOME,
1369                 JEMBeanHelper.ATT_OPEN,
1370                 JEMBeanHelper.ATT_IS_READ_ONLY,
1371                 JEMBeanHelper.ATT_IS_TRANSACTIONAL,
1372                 JEMBeanHelper.ATT_CACHE_SIZE,
1373                 JEMBeanHelper.ATT_CACHE_PERCENT,
1374                 JEMBeanHelper.ATT_LOCK_TIMEOUT,
1375                 JEMBeanHelper.ATT_IS_SERIALIZABLE,
1376                 JEMBeanHelper.ATT_SET_READ_ONLY,
1377         });
1378         addBdbjeAttributes(attributes,
1379                 this.bdbjeMBeanHelper.getAttributeList(env),
1380                 this.bdbjeAttributeNameList);
1381 
1382         // Operations.
1383         List<OpenMBeanOperationInfo> operations
1384          = new ArrayList<OpenMBeanOperationInfo>();
1385         OpenMBeanParameterInfo[] args = new OpenMBeanParameterInfoSupport[3];
1386         args[0] = new OpenMBeanParameterInfoSupport("url",
1387             "URL to add to the frontier", SimpleType.STRING);
1388         args[1] = new OpenMBeanParameterInfoSupport("forceFetch",
1389             "True if URL is to be force fetched", SimpleType.BOOLEAN);
1390         args[2] = new OpenMBeanParameterInfoSupport("seed",
1391             "True if URL is a seed", SimpleType.BOOLEAN);
1392         operations.add(new OpenMBeanOperationInfoSupport(IMPORT_URI_OPER,
1393             "Add passed URL to the frontier", args, SimpleType.VOID,
1394                 MBeanOperationInfo.ACTION));
1395         
1396         args = new OpenMBeanParameterInfoSupport[4];
1397         args[0] = new OpenMBeanParameterInfoSupport("pathOrUrl",
1398             "Path or URL to file of URLs", SimpleType.STRING);
1399         args[1] = new OpenMBeanParameterInfoSupport("style",
1400             "Format format:default|crawlLog|recoveryJournal",
1401             SimpleType.STRING);
1402         args[2] = new OpenMBeanParameterInfoSupport("forceFetch",
1403             "True if URLs are to be force fetched", SimpleType.BOOLEAN);
1404         args[3] = new OpenMBeanParameterInfoSupport("seed",
1405             "True if all content are seeds.", SimpleType.BOOLEAN);
1406         operations.add(new OpenMBeanOperationInfoSupport(IMPORT_URIS_OPER,
1407             "Add file of passed URLs to the frontier", args, SimpleType.STRING,
1408                 MBeanOperationInfo.ACTION));
1409         
1410         
1411         args = new OpenMBeanParameterInfoSupport[4];
1412         args[0] = new OpenMBeanParameterInfoSupport("filename",
1413                 "File to print to", SimpleType.STRING);
1414         args[1] = new OpenMBeanParameterInfoSupport("regexp",
1415                 "Regular expression URLs must match", SimpleType.STRING);
1416         args[2] = new OpenMBeanParameterInfoSupport("numberOfMatches",
1417                 "Maximum number of matches to return", SimpleType.INTEGER);
1418         args[3] = new OpenMBeanParameterInfoSupport("verbose",
1419                 "Should they be verbose descriptions", SimpleType.BOOLEAN);
1420         operations.add(new OpenMBeanOperationInfoSupport(DUMP_URIS_OPER,
1421                 "Dump pending URIs from frontier to a file", args,
1422                 SimpleType.VOID, MBeanOperationInfo.ACTION));
1423         
1424         operations.add(new OpenMBeanOperationInfoSupport(PAUSE_OPER,
1425             "Pause crawling (noop if already paused)", null, SimpleType.VOID,
1426             MBeanOperationInfo.ACTION));
1427         
1428         operations.add(new OpenMBeanOperationInfoSupport(RESUME_OPER,
1429             "Resume crawling (noop if already resumed)", null,
1430             SimpleType.VOID, MBeanOperationInfo.ACTION));
1431         
1432         args = new OpenMBeanParameterInfoSupport[1];
1433         args[0] = new OpenMBeanParameterInfoSupport("name",
1434             "Name of report ('all', 'standard', etc.).", SimpleType.STRING);
1435         operations.add(new OpenMBeanOperationInfoSupport(FRONTIER_REPORT_OPER,
1436              "Full frontier report", args, SimpleType.STRING,
1437              MBeanOperationInfo.INFO));
1438         
1439         operations.add(new OpenMBeanOperationInfoSupport(THREADS_REPORT_OPER,
1440              "Full thread report", null, SimpleType.STRING,
1441              MBeanOperationInfo.INFO));
1442         
1443         operations.add(new OpenMBeanOperationInfoSupport(SEEDS_REPORT_OPER,
1444              "Seeds report", null, SimpleType.STRING, MBeanOperationInfo.INFO));  
1445  
1446         operations.add(
1447                 new OpenMBeanOperationInfoSupport(PROGRESS_STATISTICS_OPER,
1448                 "Progress statistics at time of invocation", null,
1449                 SimpleType.STRING, MBeanOperationInfo.INFO)); 
1450         
1451         operations.add(new OpenMBeanOperationInfoSupport(
1452             PROGRESS_STATISTICS_LEGEND_OPER,
1453                 "Progress statistics legend", null,
1454                 SimpleType.STRING, MBeanOperationInfo.INFO));  
1455         
1456         operations.add(new OpenMBeanOperationInfoSupport(CHECKPOINT_OPER,
1457                 "Start a checkpoint", null, SimpleType.VOID,
1458                 MBeanOperationInfo.ACTION));
1459                 
1460         // Add bdbje operations. Add subset only. Keep around the list so have
1461         // it to hand when figuring what operations are supported. Usual actual
1462         // Strings because not accessible from JEMBeanHelper.
1463         this.bdbjeOperationsNameList = Arrays.asList(new String[] { "cleanLog",
1464                 "evictMemory", "checkpoint", "sync",
1465                 "getEnvironmentStatsToString", "getLockStatsToString",
1466                 "getDatabaseNames", OP_DB_STAT
1467         });
1468         addBdbjeOperations(operations,
1469                 this.bdbjeMBeanHelper.getOperationList(env),
1470                 this.bdbjeOperationsNameList);
1471         
1472         // Register notifications
1473         List<MBeanNotificationInfo> notifications
1474          = new ArrayList<MBeanNotificationInfo>();
1475         notifications.add(
1476             new MBeanNotificationInfo(new String [] {"crawlStarted",
1477                     "crawlEnding", "crawlPaused", "crawlResuming", PROG_STATS},
1478                 this.getClass().getName() + ".notifications",
1479                 "CrawlStatusListener events and progress statistics as " +
1480                     "notifications"));
1481         MBeanNotificationInfo [] notificationsArray =
1482             new MBeanNotificationInfo[notifications.size()];
1483         notifications.toArray(notificationsArray);
1484         
1485         // Build the info object.
1486         OpenMBeanAttributeInfoSupport[] attributesArray =
1487             new OpenMBeanAttributeInfoSupport[attributes.size()];
1488         attributes.toArray(attributesArray);
1489         OpenMBeanOperationInfoSupport[] operationsArray =
1490             new OpenMBeanOperationInfoSupport[operations.size()];
1491         operations.toArray(operationsArray);
1492         return new OpenMBeanInfoSupport(this.getClass().getName(),
1493             "Current Crawl Job as OpenMBean",
1494             attributesArray,
1495             new OpenMBeanConstructorInfoSupport [] {},
1496             operationsArray,
1497             notificationsArray);
1498     }
1499     
1500     protected void addBdbjeAttributes(
1501             final List<OpenMBeanAttributeInfo> attributes,
1502             final List<MBeanAttributeInfo> bdbjeAttributes, 
1503             final List<String> bdbjeNamesToAdd) {
1504         for (MBeanAttributeInfo info: bdbjeAttributes) {
1505             if (bdbjeNamesToAdd.contains(info.getName())) {
1506                 attributes.add(JmxUtils.convertToOpenMBeanAttribute(info));
1507             }
1508         }   
1509     }
1510     
1511     protected void addBdbjeOperations(
1512             final List<OpenMBeanOperationInfo> operations,
1513             final List<MBeanOperationInfo> bdbjeOperations, 
1514             final List<String> bdbjeNamesToAdd) {
1515         for (MBeanOperationInfo info: bdbjeOperations) {
1516             if (bdbjeNamesToAdd.contains(info.getName())) {
1517                 OpenMBeanOperationInfo omboi = null;
1518                 if (info.getName().equals(OP_DB_STAT)) {
1519                     // Db stats needs special handling. The published
1520                     // signature is wrong and its return type is awkward.
1521                     // Handle it.
1522                     omboi = JmxUtils.convertToOpenMBeanOperation(info, null,
1523                         SimpleType.STRING);
1524                     MBeanParameterInfo[] params = omboi.getSignature();
1525                     OpenMBeanParameterInfo[] args =
1526                         new OpenMBeanParameterInfoSupport[params.length + 1];
1527                     for (int ii = 0; ii < params.length; ii++) {
1528                         args[ii] = (OpenMBeanParameterInfo) params[ii];
1529                     }
1530                     args[params.length] = new OpenMBeanParameterInfoSupport(
1531                             "name", "Database name", SimpleType.STRING);
1532                     omboi = new OpenMBeanOperationInfoSupport(omboi.getName(),
1533                         omboi.getDescription(), args, omboi.getReturnOpenType(),
1534                         omboi.getImpact());
1535                 } else {
1536                     omboi = JmxUtils.convertToOpenMBeanOperation(info);
1537                 }
1538                 operations.add(omboi);
1539             }
1540         }
1541     }
1542     
    /**
     * Recursively walks the crawl-order settings tree rooted at {@code type}
     * and adds an OpenMBean attribute for every setting that can be
     * expressed as an open type.  Names in {@code ORDER_EXCLUDE} are
     * skipped; nested {@code ComplexType}s are descended into; settings of
     * unsupported types (e.g. StringList) are only logged at FINE.
     *
     * @param type settings node to walk (crawl order or nested module).
     * @param attributes destination list; attributes are added under their
     * absolute settings path.
     */
    protected void addCrawlOrderAttributes(final ComplexType type,
            final List<OpenMBeanAttributeInfo> attributes) {
        for (final Iterator i = type.getAttributeInfoIterator(null);
                i.hasNext();) {
            ModuleAttributeInfo info = (ModuleAttributeInfo)i.next();
            if (ORDER_EXCLUDE.contains(info.getName())) {
                // Skip.
                continue;
            }
            // Expose the attribute under its absolute settings path so it
            // can be addressed unambiguously from JMX.
            String absoluteName = type.getAbsoluteName() + "/" + info.getName();
            if (JmxUtils.isOpenType(info.getType())) {
                String description = info.getDescription();
                if (description == null || description.length() <= 0) {
                    // Description can't be empty.
                    description = info.getName();
                }
                attributes.add(new OpenMBeanAttributeInfoSupport(
                    absoluteName, description,
                    JmxUtils.getOpenType(info.getType()), true, true, false));
            } else if(info.isComplexType()) {
                try {
                    ComplexType c =
                        (ComplexType)type.getAttribute(info.getName());
                    // Recurse into the nested settings module.
                    addCrawlOrderAttributes(c, attributes);
                } catch (AttributeNotFoundException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute", e);
                } catch (MBeanException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute", e);
                } catch (ReflectionException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute", e);
                }
            } else if (info.getType().equals(TextField.class.getName())) {
                // Special handling for TextField.  Use the STRING OpenType.
                attributes.add(new OpenMBeanAttributeInfoSupport(
                        absoluteName, info.getDescription(),
                        SimpleType.STRING, true, true, false));
            } else {
                // Looks like only type we don't currently handle is StringList.
                // Figure how to do it.  Add as AttributeList?
                logger.fine(info.getType());
            }
        }
    }
1586     
1587     public Object getAttribute(String attribute_name)
1588     throws AttributeNotFoundException {
1589         if (attribute_name == null) {
1590             throw new RuntimeOperationsException(
1591                  new IllegalArgumentException("Attribute name cannot be null"),
1592                  "Cannot call getAttribute with null attribute name");
1593         }
1594         
1595         // If no controller, we can't do any work in here.
1596         if (this.controller == null) {
1597             throw new RuntimeOperationsException(
1598                  new NullPointerException("Controller is null"),
1599                  "Controller is null");
1600         }
1601         
1602         // Is it a bdbje attribute?
1603         if (this.bdbjeAttributeNameList.contains(attribute_name)) {
1604             try {
1605                 return this.bdbjeMBeanHelper.getAttribute(
1606                         this.controller.getBdbEnvironment(), attribute_name);
1607             } catch (MBeanException e) {
1608                 throw new RuntimeOperationsException(new RuntimeException(e));
1609             }
1610         }
1611         
1612         // Is it a crawl-order attribute?
1613         if (attribute_name.
1614                 startsWith(this.controller.getOrder().getAbsoluteName())) {
1615             return getCrawlOrderAttribute(attribute_name);
1616         }
1617         
1618         if (!ATTRIBUTE_LIST.contains(attribute_name)) {
1619             throw new AttributeNotFoundException("Attribute " +
1620                     attribute_name + " is unimplemented.");
1621         }
1622 
1623         // The pattern in the below is to match an attribute and when found
1624         // do a return out of if clause.  Doing it this way, I can fall
1625         // on to the AttributeNotFoundException for case where we've an
1626         // attribute but no handler.
1627         if (attribute_name.equals(STATUS_ATTR)) {
1628             return getCrawlStatus();
1629         }
1630         if (attribute_name.equals(NAME_ATTR)) {
1631             return getJobName();
1632         }
1633         if (attribute_name.equals(UID_ATTR)) {
1634             return getUID();
1635         }
1636         if (attribute_name.equals(TOTAL_DATA_ATTR)) {
1637             return new Long(this.controller == null &&
1638                     this.controller.getStatistics() != null? 0:
1639                 this.controller.getStatistics().totalBytesWritten());
1640         }
1641         if (attribute_name.equals(CRAWL_TIME_ATTR)) {
1642             return new Long(this.controller == null &&
1643                     this.controller.getStatistics() != null? 0:
1644                 this.controller.getStatistics().getCrawlerTotalElapsedTime() /
1645                     1000);
1646         }
1647         if (attribute_name.equals(CURRENT_DOC_RATE_ATTR)) {
1648             return new Double(this.controller == null &&
1649                     this.controller.getStatistics() != null? 0:
1650                 this.controller.getStatistics().currentProcessedDocsPerSec());
1651         }
1652         if (attribute_name.equals(DOC_RATE_ATTR)) {
1653             return new Double(this.controller == null &&
1654                     this.controller.getStatistics() != null? 0:
1655                 this.controller.getStatistics().processedDocsPerSec());
1656         }
1657         if (attribute_name.equals(KB_RATE_ATTR)) {
1658             return new Long(this.controller == null &&
1659                     this.controller.getStatistics() != null? 0:
1660                 this.controller.getStatistics().currentProcessedKBPerSec());
1661         }
1662         if (attribute_name.equals(CURRENT_KB_RATE_ATTR)) {
1663             return new Long(this.controller == null &&
1664                     this.controller.getStatistics() != null? 0:
1665                 this.controller.getStatistics().processedKBPerSec());
1666         }
1667         if (attribute_name.equals(THREAD_COUNT_ATTR)) {
1668             return new Integer(this.controller == null &&
1669                     this.controller.getStatistics() != null? 0:
1670                 this.controller.getStatistics().activeThreadCount());
1671         }       
1672         if (attribute_name.equals(FRONTIER_SHORT_REPORT_ATTR)) {
1673             return getFrontierOneLine();
1674         }
1675         if (attribute_name.equals(THREADS_SHORT_REPORT_ATTR)) {
1676             return getThreadOneLine();
1677         }
1678         if (attribute_name.equals(DISCOVERED_COUNT_ATTR)) {
1679             return new Long(this.controller == null &&
1680                     this.controller.getStatistics() != null? 0:
1681                 this.controller.getStatistics().totalCount());
1682         }
1683         if (attribute_name.equals(DOWNLOAD_COUNT_ATTR)) {
1684             return new Long(this.controller == null &&
1685                     this.controller.getStatistics() != null? 0:
1686                 this.controller.getStatistics().successfullyFetchedCount());
1687         }
1688         
1689         throw new AttributeNotFoundException("Attribute " +
1690             attribute_name + " not found.");
1691     }
1692     
1693     protected Object getCrawlOrderAttribute(final String attribute_name) {
1694         CrawlOrder order = this.getController().getOrder();
1695         Object result = null;
1696         try {
1697             result = getCrawlOrderAttribute(attribute_name.substring(order
1698                     .getAbsoluteName().length()), order);
1699         } catch (NullPointerException e) {
1700             logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1701         } catch (AttributeNotFoundException e) {
1702             logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1703         } catch (MBeanException e) {
1704             logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1705         } catch (ReflectionException e) {
1706             logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1707         }
1708         return result;
1709     }
1710 
1711     protected Object getCrawlOrderAttribute(final String attribute_name,
1712             final ComplexType ct)
1713     throws AttributeNotFoundException, MBeanException, ReflectionException {
1714         String subName = attribute_name.startsWith("/") ? attribute_name
1715                 .substring(1) : attribute_name;
1716         int index = subName.indexOf("/");
1717         if (index <= 0) {
1718             MBeanAttributeInfo info = ct.getAttributeInfo(subName);
1719             // Special handling for TextField.
1720             return info.getType().equals(TextField.class.getName()) ? ct
1721                     .getAttribute(subName).toString() : ct
1722                     .getAttribute(subName);
1723         }
1724         return getCrawlOrderAttribute(subName.substring(index + 1),
1725                 (ComplexType) ct.getAttribute(subName.substring(0, index)));
1726     }
1727     
1728     public AttributeList getAttributes(String [] attributeNames) {
1729         if (attributeNames == null) {
1730             throw new RuntimeOperationsException(
1731                 new IllegalArgumentException("attributeNames[] cannot be " +
1732                 "null"), "Cannot call getAttributes with null attribute " +
1733                 "names");
1734         }
1735         
1736         // If no controller, we can't do any work in here.
1737         if (this.controller == null) {
1738             throw new RuntimeOperationsException(
1739                  new NullPointerException("Controller is null"),
1740                  "Controller is null");
1741         }
1742         
1743         AttributeList resultList = new AttributeList();
1744         if (attributeNames.length == 0) {
1745             return resultList;
1746         }
1747         for (int i = 0; i < attributeNames.length; i++) {
1748             try {
1749                 Object value = getAttribute(attributeNames[i]);
1750                 resultList.add(new Attribute(attributeNames[i], value));
1751             } catch (Exception e) {
1752                 e.printStackTrace();
1753             }
1754         }
1755         return(resultList);
1756     }
1757 
1758     public void setAttribute(Attribute attribute)
1759             throws AttributeNotFoundException {
1760         // Is it a crawl order attribute?
1761         CrawlOrder order = this.getController().getOrder();
1762         String attName = attribute.getName();
1763         if (attName.startsWith(order.getAbsoluteName())) {
1764             try {
1765                 setCrawlOrderAttribute(attribute.getName().substring(
1766                         order.getAbsoluteName().length()), order, attribute);
1767             } catch (NullPointerException e) {
1768                 logger.log(Level.SEVERE, "Failed set of " + attName, e);
1769             } catch (AttributeNotFoundException e) {
1770                 logger.log(Level.SEVERE, "Failed set of " + attName, e);
1771             } catch (MBeanException e) {
1772                 logger.log(Level.SEVERE, "Failed set of " + attName, e);
1773             } catch (ReflectionException e) {
1774                 logger.log(Level.SEVERE, "Failed set of " + attName, e);
1775             } catch (InvalidAttributeValueException e) {
1776                 logger.log(Level.SEVERE, "Failed set of " + attName, e);
1777             }
1778             return;
1779         }
1780         
1781         // Is it a bdbje attribute?
1782         if (this.bdbjeAttributeNameList.contains(attName)) {
1783             try {
1784                 this.bdbjeMBeanHelper.setAttribute(this.controller
1785                         .getBdbEnvironment(), attribute);
1786             } catch (AttributeNotFoundException e) {
1787                 throw new RuntimeOperationsException(new RuntimeException(e));
1788             } catch (InvalidAttributeValueException e) {
1789                 throw new RuntimeOperationsException(new RuntimeException(e));
1790             }
1791             return;
1792         }
1793         
1794         // Else, we don't know how to handle this attribute.
1795         throw new AttributeNotFoundException("Attribute " + attName +
1796             " can not be set.");
1797     }
1798     
1799     protected void setCrawlOrderAttribute(final String attribute_name,
1800             final ComplexType ct, final Attribute attribute)
1801     throws AttributeNotFoundException, InvalidAttributeValueException,
1802             MBeanException, ReflectionException {
1803         String subName = attribute_name.startsWith("/") ? attribute_name
1804                 .substring(1) : attribute_name;
1805         int index = subName.indexOf("/");
1806         if (index <= 0) {
1807             ct.setAttribute(new Attribute(subName, attribute.getValue()));
1808             return;
1809         }
1810         setCrawlOrderAttribute(subName.substring(index + 1), (ComplexType) ct
1811                 .getAttribute(subName.substring(0, index)), attribute);
1812     }
1813 
1814     public AttributeList setAttributes(AttributeList attributes) {
1815         if (attributes == null) {
1816             throw new RuntimeOperationsException(
1817                 new IllegalArgumentException("attributeNames[] cannot be " +
1818                 "null"), "Cannot call getAttributes with null attribute " +
1819                 "names");
1820         }
1821         
1822         AttributeList resultList = new AttributeList();
1823         if (attributes.size() == 0) {
1824             return resultList;
1825         }
1826         for (int i = 0; i < attributes.size(); i++) {
1827             try {
1828                 Attribute attr = (Attribute)attributes.get(i);
1829                 setAttribute(attr);
1830                 String an = attr.getName();
1831                 Object newValue = getAttribute(an);
1832                 resultList.add(new Attribute(an, newValue));
1833             } catch (Exception e) {
1834                 e.printStackTrace();
1835             }
1836         }
1837         return resultList;
1838     }
1839 
1840     public Object invoke(String operationName, Object[] params,
1841         String[] signature)
1842     throws ReflectionException {
1843         if (operationName == null) {
1844             throw new RuntimeOperationsException(
1845                 new IllegalArgumentException("Operation name cannot be null"),
1846                 "Cannot call invoke with null operation name");
1847         }
1848         
1849         controller.installThreadContextSettingsHandler();
1850         
1851         if (this.bdbjeOperationsNameList.contains(operationName)) {
1852             try {
1853                 Object o = this.bdbjeMBeanHelper.invoke(
1854                         this.controller.getBdbEnvironment(),
1855                         operationName, params, signature);
1856                 // If OP_DB_ST, return String version of result.
1857                 if (operationName.equals(OP_DB_STAT)) {
1858                     return o.toString();
1859                 }
1860                 return o;
1861             } catch (MBeanException e) {
1862                 throw new RuntimeOperationsException(new RuntimeException(e));
1863             }
1864         }
1865         
1866         // TODO: Exploit passed signature.
1867         
1868         // The pattern in the below is to match an operation and when found
1869         // do a return out of if clause.  Doing it this way, I can fall
1870         // on to the MethodNotFoundException for case where we've an
1871         // attribute but no handler.
1872         if (operationName.equals(IMPORT_URI_OPER)) {
1873             JmxUtils.checkParamsCount(IMPORT_URI_OPER, params, 3);
1874             mustBeCrawling();
1875             try {
1876                 importUri((String)params[0],
1877                     ((Boolean)params[1]).booleanValue(),
1878                     ((Boolean)params[2]).booleanValue());
1879             } catch (URIException e) {
1880                 throw new RuntimeOperationsException(new RuntimeException(e));
1881             }
1882             return null;
1883         }
1884         
1885         if (operationName.equals(IMPORT_URIS_OPER)) {
1886             JmxUtils.checkParamsCount(IMPORT_URIS_OPER, params, 4);
1887             mustBeCrawling();
1888             return importUris((String)params[0],
1889                 ((String)params[1]).toString(),
1890                 ((Boolean)params[2]).booleanValue(),
1891                 ((Boolean)params[3]).booleanValue());
1892         }
1893         
1894         if (operationName.equals(DUMP_URIS_OPER)) {
1895             JmxUtils.checkParamsCount(DUMP_URIS_OPER, params, 4);
1896             mustBeCrawling();
1897             if (!this.controller.isPaused()) {
1898                 throw new RuntimeOperationsException(
1899                         new IllegalArgumentException("Must " + "be paused"),
1900                         "Cannot dump URI's from running job.");
1901             }
1902             dumpUris((String) params[0], (String) params[1],
1903                     ((Integer) params[2]).intValue(), ((Boolean) params[3])
1904                             .booleanValue());
1905         }
1906         
1907         if (operationName.equals(PAUSE_OPER)) {
1908             JmxUtils.checkParamsCount(PAUSE_OPER, params, 0);
1909             mustBeCrawling();
1910             pause();
1911             return null;
1912         }
1913         
1914         if (operationName.equals(RESUME_OPER)) {
1915             JmxUtils.checkParamsCount(RESUME_OPER, params, 0);
1916             mustBeCrawling();
1917             resume();
1918             return null;
1919         }
1920         
1921         if (operationName.equals(FRONTIER_REPORT_OPER)) {
1922             JmxUtils.checkParamsCount(FRONTIER_REPORT_OPER, params, 1);
1923             mustBeCrawling();
1924             return getFrontierReport((String)params[0]);
1925         }
1926         
1927         if (operationName.equals(THREADS_REPORT_OPER)) {
1928             JmxUtils.checkParamsCount(THREADS_REPORT_OPER, params, 0);
1929             mustBeCrawling();
1930             return getThreadsReport();
1931         }
1932         
1933         if (operationName.equals(SEEDS_REPORT_OPER)) {
1934             JmxUtils.checkParamsCount(SEEDS_REPORT_OPER, params, 0);
1935             mustBeCrawling();
1936             StringWriter sw = new StringWriter();
1937             if (getStatisticsTracking() != null &&
1938                     getStatisticsTracking() instanceof StatisticsTracker) {
1939                 ((StatisticsTracker)getStatisticsTracking()).
1940                     writeSeedsReportTo(new PrintWriter(sw));
1941             } else {
1942                 sw.write("Unsupported");
1943             }
1944             return sw.toString();
1945         }       
1946         
1947         if (operationName.equals(CHECKPOINT_OPER)) {
1948             JmxUtils.checkParamsCount(CHECKPOINT_OPER, params, 0);
1949             mustBeCrawling();
1950             try {
1951                 checkpoint();
1952             } catch (IllegalStateException e) {
1953                 throw new RuntimeOperationsException(e);
1954             }
1955             return null;
1956         }
1957         
1958         if (operationName.equals(PROGRESS_STATISTICS_OPER)) {
1959             JmxUtils.checkParamsCount(PROGRESS_STATISTICS_OPER, params, 0);
1960             mustBeCrawling();
1961             return getStatisticsTracking().getProgressStatisticsLine();
1962         }
1963         
1964         if (operationName.equals(PROGRESS_STATISTICS_LEGEND_OPER)) {
1965             JmxUtils.checkParamsCount(PROGRESS_STATISTICS_LEGEND_OPER,
1966                     params, 0);
1967             return getStatisticsTracking().progressStatisticsLegend();
1968         }
1969         
1970         throw new ReflectionException(
1971             new NoSuchMethodException(operationName),
1972                 "Cannot find the operation " + operationName);
1973     }
1974     
1975     public void mustBeCrawling() {
1976         if (!isCrawling()) {
1977             throw new RuntimeOperationsException(
1978                 new IllegalArgumentException("Not " +
1979                 "crawling (Shouldn't ever be the case)"),
1980                 "Not current crawling job?");
1981         }
1982     }
1983     
1984     public boolean isCrawling() {
1985         return this.controller != null;
1986     }
1987     
1988     /***
1989      * Utility method to get the stored list of ignored seed items (if any),
1990      * from the last time the seeds were imported to the frontier.
1991      * 
1992      * @return String of all ignored seed items, or null if none
1993      */
1994     public String getIgnoredSeeds() {
1995         File ignoredFile = new File(getDirectory(),
1996                 AbstractFrontier.IGNORED_SEEDS_FILENAME);
1997         if(!ignoredFile.exists()) {
1998             return null;
1999         }
2000         try {
2001             return FileUtils.readFileAsString(ignoredFile);
2002         } catch (IOException e) {
2003             // TODO Auto-generated catch block
2004             e.printStackTrace();
2005             return null;
2006         }
2007     }
2008     
2009     /***
2010      * Forward a 'kick' update to current controller if any.
2011      * @see CrawlController#kickUpdate()
2012      */
2013     public void kickUpdate(){
2014         if (this.controller != null){
2015             this.controller.kickUpdate();
2016         }
2017     }
2018     
2019     /***
2020      * Returns a URIFrontierMarker for the current, paused, job. If there is no
2021      * current job or it is not paused null will be returned.
2022      *
2023      * @param regexpr A regular expression that each URI must match in order to
2024      * be considered 'within' the marker.
2025      * @param inCacheOnly Limit marker scope to 'cached' URIs.
2026      * @return a URIFrontierMarker for the current job.
2027      * @see #getPendingURIsList(FrontierMarker, int, boolean)
2028      * @see org.archive.crawler.framework.Frontier#getInitialMarker(String,
2029      *      boolean)
2030      * @see org.archive.crawler.framework.FrontierMarker
2031      */
2032     public FrontierMarker getInitialMarker(String regexpr,
2033             boolean inCacheOnly) {
2034         return (this.controller != null && this.controller.isPaused())?
2035            this.controller.getFrontier().getInitialMarker(regexpr, inCacheOnly):
2036                null;
2037     }
2038     
2039     /***
2040      * Returns the frontiers URI list based on the provided marker. This method
2041      * will return null if there is not current job or if the current job is
2042      * not paused. Only when there is a paused current job will this method
2043      * return a URI list.
2044      *
2045      * @param marker URIFrontier marker
2046      * @param numberOfMatches Maximum number of matches to return
2047      * @param verbose Should detailed info be provided on each URI?
2048      * @return the frontiers URI list based on the provided marker
2049      * @throws InvalidFrontierMarkerException
2050      *             When marker is inconsistent with the current state of the
2051      *             frontier.
2052      * @see #getInitialMarker(String, boolean)
2053      * @see org.archive.crawler.framework.FrontierMarker
2054      */
2055     public ArrayList<String> getPendingURIsList(FrontierMarker marker,
2056             int numberOfMatches, boolean verbose)
2057     throws InvalidFrontierMarkerException {
2058         return  (this.controller != null && this.controller.isPaused())?
2059             this.controller.getFrontier().getURIsList(marker, numberOfMatches,
2060                     verbose):
2061             null;
2062     }
2063 
2064     public void dumpUris(String filename, String regexp, int numberOfMatches,
2065             boolean verbose) {
2066         try {
2067             PrintWriter out = new PrintWriter(filename); 
2068             FrontierMarker marker = 
2069                 controller.getFrontier().getInitialMarker(regexp, false);
2070             int matchesDumped = 0;
2071             
2072             while(matchesDumped<numberOfMatches) {
2073                 int batchMatches = Math.min(100, numberOfMatches-matchesDumped);
2074                 
2075                 ArrayList<String> batchOfUris = 
2076                     getPendingURIsList(marker,batchMatches,false);
2077                 for(String uriLine : batchOfUris) {
2078                     out.write(uriLine);
2079                     out.write("\n");
2080                     matchesDumped++;
2081                 }
2082                 if (batchOfUris.size()<batchMatches) {
2083                     // must be exhausted; we're finished
2084                     break; 
2085                 }
2086             }
2087             IOUtils.closeQuietly(out); 
2088         } catch (FileNotFoundException e) {
2089             logger.log(Level.SEVERE, "Failed dumpUris write", e);
2090         } catch (InvalidFrontierMarkerException e) {
2091             logger.log(Level.SEVERE, "Failed dumpUris", e);
2092         }
2093     }
2094     
2095     public void crawlStarted(String message) {
2096         if (this.mbeanName != null) {
2097             // Can be null around job startup.
2098             sendNotification(new Notification("crawlStarted",
2099                 this.mbeanName,  getNotificationsSequenceNumber(), message)); 
2100         }
2101     }
2102 
2103     public void crawlEnding(String sExitMessage) {
2104         setRunning(false);
2105         setStatus(sExitMessage);
2106         setReadOnly();
2107         if (this.mbeanName != null) {
2108             sendNotification(new Notification("crawlEnding", this.mbeanName,
2109                 getNotificationsSequenceNumber(), sExitMessage));
2110         }
2111     }
2112 
    /**
     * CrawlStatusListener callback: crawl has fully ended.
     * Intentionally a no-op -- see inline notes for history.
     *
     * @param sExitMessage reason the crawl ended (unused).
     */
    public void crawlEnded(String sExitMessage) {
        // Let the settings handler be cleaned up by the crawl controller
        // completeStop. Just let go of our reference in here.
        // if (this.settingsHandler != null) {
        //    this.settingsHandler.cleanup();
        // }
        
        // We used to zero-out datamembers but no longer needed now CrawlJobs
        // no longer persist after completion (They used to be kept around in
        // a list so operator could view CrawlJob finish state and reports --
        // but we now dump actual job and create a new uninitialized CrawlJob
        // that points at old CrawlJob data. 
    }
2126 
    /**
     * CrawlStatusListener callback: crawl is pausing.  Only records the
     * status message; the "crawlPaused" JMX notification is sent by
     * {@link #crawlPaused(String)} once fully paused.
     *
     * @param statusMessage new status to record.
     */
    public void crawlPausing(String statusMessage) {
        setStatus(statusMessage);
    }
2130 
2131     public void crawlPaused(String statusMessage) {
2132         setStatus(statusMessage);
2133         if (this.mbeanName != null) {
2134             // Can be null around job startup.
2135             sendNotification(new Notification("crawlPaused", this.mbeanName,
2136                 getNotificationsSequenceNumber(), statusMessage));
2137         }
2138     }
2139 
2140     public void crawlResuming(String statusMessage) {
2141         setStatus(statusMessage);
2142         if (this.mbeanName != null) {
2143             // Can be null around job startup.
2144             sendNotification(new Notification("crawlResuming", this.mbeanName,
2145                 getNotificationsSequenceNumber(), statusMessage));
2146         }
2147     }
2148 
    /**
     * CrawlStatusListener callback: a checkpoint is starting.  Only updates
     * the displayed job status; checkpoint work itself happens elsewhere.
     *
     * @param checkpointDir directory the checkpoint is written to (unused
     * here).
     * @throws Exception per the listener interface contract.
     */
    public void crawlCheckpoint(File checkpointDir) throws Exception {
        setStatus(CrawlJob.STATUS_CHECKPOINTING);
    }
2152 
    /**
     * @return this job's CrawlController, or null when the job is not the
     * currently crawling one (see {@link #isCrawling()}).
     */
    public CrawlController getController() {
        return this.controller;
    }
2156     
2157     public ObjectName preRegister(final MBeanServer server, ObjectName on)
2158     throws Exception {
2159         this.mbeanServer = server;
2160         @SuppressWarnings("unchecked")
2161         Hashtable<String,String> ht = on.getKeyPropertyList();
2162         if (!ht.containsKey(JmxUtils.NAME)) {
2163             throw new IllegalArgumentException("Name property required" +
2164                 on.getCanonicalName());
2165         }
2166         // Now append key/values from hosting heritrix JMX ObjectName so it can be
2167         // found just by examination of the CrawlJob JMX ObjectName.  Add heritrix
2168         // name attribute as 'mother' attribute.
2169         Heritrix h = getHostingHeritrix();
2170         if (h == null || h.getMBeanName() == null) {
2171             throw new IllegalArgumentException("Hosting heritrix not found " +
2172                 "or not registered with JMX: " + on.getCanonicalName());
2173         }
2174         @SuppressWarnings("unchecked")
2175         Map<String,String> hht = h.getMBeanName().getKeyPropertyList();
2176         ht.put(JmxUtils.MOTHER, hht.get(JmxUtils.NAME));
2177         String port = hht.get(JmxUtils.JMX_PORT);
2178         if (port != null) {
2179         	ht.put(JmxUtils.JMX_PORT, port);
2180         }
2181         ht.put(JmxUtils.HOST, hht.get(JmxUtils.HOST));
2182         if (!ht.containsKey(JmxUtils.TYPE)) {
2183             ht.put(JmxUtils.TYPE, CRAWLJOB_JMXMBEAN_TYPE);
2184         }
2185         this.mbeanName = new ObjectName(on.getDomain(), ht);
2186         return this.mbeanName;
2187     }
2188 
2189     public void postRegister(Boolean registrationDone) {
2190         if (logger.isLoggable(Level.INFO)) {
2191             logger.info(
2192                 JmxUtils.getLogRegistrationMsg(this.mbeanName.getCanonicalName(),
2193                 this.mbeanServer, registrationDone.booleanValue()));
2194         }
2195     }
2196 
    /**
     * {@code MBeanRegistration} hook invoked before JMX deregistration.
     * Nothing to release at this point; cleanup happens in
     * {@code postDeregister}.
     *
     * @throws Exception Declared by the interface contract; never thrown.
     */
    public void preDeregister() throws Exception {
        // Nothing to do.
    }
2200 
2201     public void postDeregister() {
2202         if (mbeanName ==  null) {
2203             return;
2204         }
2205         if (logger.isLoggable(Level.INFO)) {
2206             logger.info(JmxUtils.getLogUnregistrationMsg(
2207                     this.mbeanName.getCanonicalName(), this.mbeanServer));
2208         }
2209         this.mbeanName = null;
2210     }
2211     
2212     /***
2213      * @return Heritrix that is hosting this job.
2214      */
2215     protected Heritrix getHostingHeritrix() {
2216         Heritrix hostingHeritrix = null;
2217         Map heritrice = Heritrix.getInstances();
2218         for (final Iterator i = heritrice.keySet().iterator(); i.hasNext();) {
2219             Heritrix h = (Heritrix)heritrice.get(i.next());
2220             if (h.getJobHandler().getCurrentJob() == this) {
2221                 hostingHeritrix = h;
2222                 break;
2223             }
2224         }
2225         return hostingHeritrix;
2226     }
2227     
2228     /***
2229      * @return Unique name for job that is safe to use in jmx (Like display
2230      * name but without spaces).
2231      */
2232     public String getJmxJobName() {
2233         return getJobName() + "-" + getUID();
2234     }
2235 
2236     /***
2237      * @return Notification sequence number (Does increment after each access).
2238      */
2239     protected static int getNotificationsSequenceNumber() {
2240         return notificationsSequenceNumber++;
2241     }
2242 
    /**
     * @return The JMX ObjectName this job is registered under, or null when
     * not (yet) registered (set in preRegister, cleared in postDeregister).
     */
    protected ObjectName getMbeanName() {
        return this.mbeanName;
    }
2246     
2247     /***
2248      * @return the statistics tracking instance (of null if none yet available).
2249      */
2250     public StatisticsTracking getStatisticsTracking() {
2251         return this.controller == null ||
2252             this.controller.getStatistics() == null? null:
2253                 this.controller.getStatistics();
2254     }
2255 }