1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 package org.archive.crawler.admin;
22
23 import java.io.BufferedReader;
24 import java.io.File;
25 import java.io.FileNotFoundException;
26 import java.io.FileReader;
27 import java.io.FileWriter;
28 import java.io.IOException;
29 import java.io.InputStream;
30 import java.io.InputStreamReader;
31 import java.io.PrintWriter;
32 import java.io.Serializable;
33 import java.io.StringWriter;
34 import java.util.ArrayList;
35 import java.util.Arrays;
36 import java.util.Collection;
37 import java.util.EventObject;
38 import java.util.Hashtable;
39 import java.util.Iterator;
40 import java.util.List;
41 import java.util.Map;
42 import java.util.logging.Level;
43 import java.util.logging.Logger;
44
45 import javax.management.Attribute;
46 import javax.management.AttributeList;
47 import javax.management.AttributeNotFoundException;
48 import javax.management.DynamicMBean;
49 import javax.management.InstanceAlreadyExistsException;
50 import javax.management.InvalidAttributeValueException;
51 import javax.management.MBeanAttributeInfo;
52 import javax.management.MBeanException;
53 import javax.management.MBeanInfo;
54 import javax.management.MBeanNotificationInfo;
55 import javax.management.MBeanOperationInfo;
56 import javax.management.MBeanParameterInfo;
57 import javax.management.MBeanRegistration;
58 import javax.management.MBeanRegistrationException;
59 import javax.management.MBeanServer;
60 import javax.management.NotCompliantMBeanException;
61 import javax.management.Notification;
62 import javax.management.NotificationBroadcasterSupport;
63 import javax.management.ObjectName;
64 import javax.management.ReflectionException;
65 import javax.management.RuntimeOperationsException;
66 import javax.management.openmbean.CompositeData;
67 import javax.management.openmbean.CompositeDataSupport;
68 import javax.management.openmbean.CompositeType;
69 import javax.management.openmbean.OpenDataException;
70 import javax.management.openmbean.OpenMBeanAttributeInfo;
71 import javax.management.openmbean.OpenMBeanAttributeInfoSupport;
72 import javax.management.openmbean.OpenMBeanConstructorInfoSupport;
73 import javax.management.openmbean.OpenMBeanInfoSupport;
74 import javax.management.openmbean.OpenMBeanOperationInfo;
75 import javax.management.openmbean.OpenMBeanOperationInfoSupport;
76 import javax.management.openmbean.OpenMBeanParameterInfo;
77 import javax.management.openmbean.OpenMBeanParameterInfoSupport;
78 import javax.management.openmbean.SimpleType;
79
80 import org.apache.commons.httpclient.URIException;
81 import org.apache.commons.io.IOUtils;
82 import org.archive.crawler.Heritrix;
83 import org.archive.crawler.datamodel.CandidateURI;
84 import org.archive.crawler.datamodel.Checkpoint;
85 import org.archive.crawler.datamodel.CrawlOrder;
86 import org.archive.crawler.event.CrawlStatusListener;
87 import org.archive.crawler.framework.CrawlController;
88 import org.archive.crawler.framework.FrontierMarker;
89 import org.archive.crawler.framework.StatisticsTracking;
90 import org.archive.crawler.framework.exceptions.InitializationException;
91 import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
92 import org.archive.crawler.frontier.AbstractFrontier;
93 import org.archive.crawler.settings.ComplexType;
94 import org.archive.crawler.settings.ModuleAttributeInfo;
95 import org.archive.crawler.settings.TextField;
96 import org.archive.crawler.settings.XMLSettingsHandler;
97 import org.archive.crawler.util.CheckpointUtils;
98 import org.archive.crawler.util.IoUtils;
99 import org.archive.util.ArchiveUtils;
100 import org.archive.util.FileUtils;
101 import org.archive.util.JEMBeanHelper;
102 import org.archive.util.JmxUtils;
103 import org.archive.util.iterator.LineReadingIterator;
104 import org.archive.util.iterator.RegexpLineIterator;
105
106 import com.sleepycat.je.DatabaseException;
107 import com.sleepycat.je.Environment;
108
109 /***
110 * A CrawlJob encapsulates a 'crawl order' with any and all information and
111 * methods needed by a CrawlJobHandler to accept and execute them.
112 *
113 * <p>A given crawl job may also be a 'profile' for a crawl. In that case it
114 * should not be executed as a crawl but can be edited and used as a template
115 * for creating new CrawlJobs.
116 *
* <p>All of its constructors are protected since only a CrawlJobHandler
* should construct new CrawlJobs.
119 *
120 * @author Kristinn Sigurdsson
121 *
122 * @see org.archive.crawler.admin.CrawlJobHandler#newJob(CrawlJob, String,
123 * String, String, String, int)
124 * @see org.archive.crawler.admin.CrawlJobHandler#newProfile(CrawlJob,
125 * String, String, String)
126 */
127
128 public class CrawlJob extends NotificationBroadcasterSupport
129 implements DynamicMBean, MBeanRegistration, CrawlStatusListener, Serializable {
130 /***
131 * Eclipse generated serial number.
132 */
133 private static final long serialVersionUID = 3411161000452525856L;
134
135 private static final Logger logger =
136 Logger.getLogger(CrawlJob.class.getName());
137
138
139
140 /*** lowest */
141 public static final int PRIORITY_MINIMAL = 0;
142 /*** low */
143 public static final int PRIORITY_LOW = 1;
144 /*** average */
145 public static final int PRIORITY_AVERAGE = 2;
146 /*** high */
147 public static final int PRIORITY_HIGH = 3;
148 /*** highest */
149 public static final int PRIORITY_CRITICAL = 4;
150
151
152
153
/*** Initial value. May not be ready to run/incomplete. */
155 public static final String STATUS_CREATED = "Created";
156 /*** Job has been successfully submitted to a CrawlJobHandler */
157 public static final String STATUS_PENDING = "Pending";
158 /*** Job is being crawled */
159 public static final String STATUS_RUNNING = "Running";
160 /*** Job was deleted by user, will not be displayed in UI. */
161 public static final String STATUS_DELETED = "Deleted";
/*** Job was terminated by user input while crawling */
163 public static final String STATUS_ABORTED = "Finished - Ended by operator";
164 /*** Something went very wrong */
165 public static final String STATUS_FINISHED_ABNORMAL =
166 "Finished - Abnormal exit from crawling";
167 /*** Job finished normally having completed its crawl. */
168 public static final String STATUS_FINISHED = "Finished";
169 /*** Job finished normally when the specified timelimit was hit. */
170 public static final String STATUS_FINISHED_TIME_LIMIT =
171 "Finished - Timelimit hit";
/*** Job finished normally when the specified amount of
173 * data (MB) had been downloaded */
174 public static final String STATUS_FINISHED_DATA_LIMIT =
175 "Finished - Maximum amount of data limit hit";
176 /*** Job finished normally when the specified number of documents had been
177 * fetched.
178 */
179 public static final String STATUS_FINISHED_DOCUMENT_LIMIT =
180 "Finished - Maximum number of documents limit hit";
/*** Job is going to be temporarily stopped after active threads are finished. */
182 public static final String STATUS_WAITING_FOR_PAUSE = "Pausing - " +
183 "Waiting for threads to finish";
/*** Job was temporarily stopped. State is kept so it can be resumed */
185 public static final String STATUS_PAUSED = "Paused";
186 /***
187 * Job is being checkpointed. When finished checkpointing, job is set
188 * back to STATUS_PAUSED (Job must be first paused before checkpointing
189 * will run).
190 */
191 public static final String STATUS_CHECKPOINTING = "Checkpointing";
/*** Job could not be launched due to an InitializationException */
193 public static final String STATUS_MISCONFIGURED = "Could not launch job " +
194 "- Fatal InitializationException";
195 /*** Job is actually a profile */
196 public static final String STATUS_PROFILE = "Profile";
197
198 public static final String STATUS_PREPARING = "Preparing";
199
200
201 private String UID;
202 private String name;
203 private String status;
204 private boolean isReadOnly = false;
205 private boolean isNew = true;
206 private boolean isProfile = false;
207 private boolean isRunning = false;
208 private int priority;
209 private int numberOfJournalEntries = 0;
210
211 private String statisticsFileSave = "";
212
213 private String errorMessage = null;
214
215 private File jobDir = null;
216
217 private transient CrawlJobErrorHandler errorHandler = null;
218
219 protected transient XMLSettingsHandler settingsHandler;
220
221 private transient CrawlController controller = null;
222
223 private static final String RECOVERY_JOURNAL_STYLE = "recoveryJournal";
224 private static final String CRAWL_LOG_STYLE = "crawlLog";
225
226
227
228 /***
229 * Server we registered with. Maybe null.
230 */
231 private transient MBeanServer mbeanServer = null;
232 private transient ObjectName mbeanName = null;
233 private static final String CRAWLJOB_JMXMBEAN_TYPE =
234 JmxUtils.SERVICE + ".Job";
235 private transient JEMBeanHelper bdbjeMBeanHelper = null;
236 private transient List<String> bdbjeAttributeNameList = null;
237 private transient List<String> bdbjeOperationsNameList = null;
238
239
240 /***
241 * The MBean we've registered ourselves with (May be null
242 * throughout life of Heritrix).
243 */
244 private transient OpenMBeanInfoSupport openMBeanInfo;
245
246 private final static String NAME_ATTR = "Name";
247 private final static String UID_ATTR = "UID";
248 private final static String STATUS_ATTR = "Status";
249 private final static String FRONTIER_SHORT_REPORT_ATTR =
250 "FrontierShortReport";
251 private final static String THREADS_SHORT_REPORT_ATTR =
252 "ThreadsShortReport";
253 private final static String TOTAL_DATA_ATTR = "TotalData";
254 private final static String CRAWL_TIME_ATTR = "CrawlTime";
255 private final static String DOC_RATE_ATTR = "DocRate";
256 private final static String CURRENT_DOC_RATE_ATTR = "CurrentDocRate";
257 private final static String KB_RATE_ATTR = "KbRate";
258 private final static String CURRENT_KB_RATE_ATTR = "CurrentKbRate";
259 private final static String THREAD_COUNT_ATTR = "ThreadCount";
260 private final static String DOWNLOAD_COUNT_ATTR = "DownloadedCount";
261 private final static String DISCOVERED_COUNT_ATTR = "DiscoveredCount";
262 private final static String [] ATTRIBUTE_ARRAY = {NAME_ATTR, UID_ATTR,
263 STATUS_ATTR, FRONTIER_SHORT_REPORT_ATTR, THREADS_SHORT_REPORT_ATTR,
264 TOTAL_DATA_ATTR, CRAWL_TIME_ATTR, DOC_RATE_ATTR,
265 CURRENT_DOC_RATE_ATTR, KB_RATE_ATTR, CURRENT_KB_RATE_ATTR,
266 THREAD_COUNT_ATTR, DOWNLOAD_COUNT_ATTR, DISCOVERED_COUNT_ATTR};
267 private final static List ATTRIBUTE_LIST = Arrays.asList(ATTRIBUTE_ARRAY);
268
269 private final static String IMPORT_URI_OPER = "importUri";
270 private final static String IMPORT_URIS_OPER = "importUris";
271 private final static String DUMP_URIS_OPER = "dumpUris";
272 private final static String PAUSE_OPER = "pause";
273 private final static String RESUME_OPER = "resume";
274 private final static String FRONTIER_REPORT_OPER = "frontierReport";
275 private final static String THREADS_REPORT_OPER = "threadsReport";
276 private final static String SEEDS_REPORT_OPER = "seedsReport";
277 private final static String CHECKPOINT_OPER = "startCheckpoint";
278 private final static String PROGRESS_STATISTICS_OPER =
279 "progressStatistics";
280 private final static String PROGRESS_STATISTICS_LEGEND_OPER =
281 "progressStatisticsLegend";
282
283 private final static String PROG_STATS = "progressStatistics";
284
285
286 private final static String OP_DB_STAT = "getDatabaseStats";
287
288 /***
289 * Don't add the following crawl-order items.
290 */
291 private final static List ORDER_EXCLUDE;
292 static {
293 ORDER_EXCLUDE = Arrays.asList(new String [] {"bdb-cache-percent",
294 "extract-processors", "DNS", "uri-included-structure"});
295 }
296
297 /***
298 * Sequence number for jmx notifications.
299 */
300 private static int notificationsSequenceNumber = 1;
301
302 /***
303 * A shutdown Constructor.
304 */
305 protected CrawlJob() {
306 super();
307 }
308
309 /***
310 * A constructor for jobs.
311 *
312 * <p> Create, ready to crawl, jobs.
313 * @param UID A unique ID for this job. Typically emitted by the
314 * CrawlJobHandler.
315 * @param name The name of the job
316 * @param settingsHandler The associated settings
317 * @param errorHandler The crawl jobs settings error handler.
318 * <tt>null</tt> means none is set
319 * @param priority job priority.
320 * @param dir The directory that is considered this jobs working directory.
321 */
322 public CrawlJob(final String UID,
323 final String name, final XMLSettingsHandler settingsHandler,
324 final CrawlJobErrorHandler errorHandler, final int priority,
325 final File dir) {
326 this(UID, name, settingsHandler, errorHandler,
327 priority, dir, null, false, true);
328 }
329
330 /***
331 * A constructor for profiles.
332 *
333 * <p> Any job created with this constructor will be
334 * considered a profile. Profiles are not stored on disk (only their
335 * settings files are stored on disk). This is because their data is
336 * predictible given any settings files.
337 * @param UIDandName A unique ID for this job. For profiles this is the same
338 * as name
339 * @param settingsHandler The associated settings
340 * @param errorHandler The crawl jobs settings error handler.
341 * <tt>null</tt> means none is set
342 */
343 protected CrawlJob(final String UIDandName,
344 final XMLSettingsHandler settingsHandler,
345 final CrawlJobErrorHandler errorHandler) {
346 this(UIDandName, UIDandName, settingsHandler, errorHandler,
347 PRIORITY_AVERAGE, null, STATUS_PROFILE, true, false);
348 }
349
/***
 * Fully-specified constructor; all other constructors delegate here.
 * Only assigns fields — no I/O and no validation is performed.
 * @param UID unique ID for this job
 * @param name the job's name
 * @param settingsHandler the associated settings
 * @param errorHandler settings error handler; <tt>null</tt> means none
 * @param priority job priority
 * @param dir the job's working directory (<tt>null</tt> for profiles)
 * @param status initial status string, or <tt>null</tt>
 * @param isProfile whether this job is a profile
 * @param isNew whether this job is considered new
 */
public CrawlJob(final String UID,
        final String name, final XMLSettingsHandler settingsHandler,
        final CrawlJobErrorHandler errorHandler, final int priority,
        final File dir, final String status, final boolean isProfile,
        final boolean isNew) {
    super();
    // Identity.
    this.UID = UID;
    this.name = name;
    // Lifecycle state.
    this.status = status;
    this.isProfile = isProfile;
    this.isNew = isNew;
    this.priority = priority;
    // Collaborators and storage.
    this.settingsHandler = settingsHandler;
    this.errorHandler = errorHandler;
    this.jobDir = dir;
}
366
367 /***
368 * A constructor for reloading jobs from disk. Jobs (not profiles) have
369 * their data written to persistent storage in the file system. This method
370 * is used to load the job from such storage. This is done by the
371 * <code>CrawlJobHandler</code>.
372 * <p>
373 * Proper structure of a job file (TODO: Maybe one day make this an XML file)
374 * Line 1. UID <br>
375 * Line 2. Job name (string) <br>
376 * Line 3. Job status (string) <br>
377 * Line 4. is job read only (true/false) <br>
378 * Line 5. is job running (true/false) <br>
379 * Line 6. job priority (int) <br>
380 * Line 7. number of journal entries <br>
381 * Line 8. setting file (with path) <br>
382 * Line 9. statistics tracker file (with path) <br>
383 * Line 10-?. error message (String, empty for null), can be many lines <br>
384 * @param jobFile
385 * a file containing information about the job to load.
386 * @param errorHandler The crawl jobs settings error handler.
387 * null means none is set
388 * @throws InvalidJobFileException
389 * if the specified file does not refer to a valid job file.
390 * @throws IOException
391 * if io operations fail
392 */
393 protected CrawlJob(final File jobFile,
394 final CrawlJobErrorHandler errorHandler)
395 throws InvalidJobFileException, IOException {
396 this(null, null, null, errorHandler,
397 PRIORITY_AVERAGE, null, null, false, true);
398 this.jobDir = jobFile.getParentFile();
399
400
401 if (jobFile.length() == 0) {
402 throw new InvalidJobFileException(jobFile.getCanonicalPath() +
403 " is corrupt (length is zero)");
404 }
405
406
407 BufferedReader jobReader =
408 new BufferedReader(new FileReader(jobFile), 4096);
409
410 this.UID = jobReader.readLine();
411
412 this.name = jobReader.readLine();
413
414 this.status = jobReader.readLine();
415 if(status.equals(STATUS_ABORTED)==false
416 && status.equals(STATUS_CREATED)==false
417 && status.equals(STATUS_DELETED)==false
418 && status.equals(STATUS_FINISHED)==false
419 && status.equals(STATUS_FINISHED_ABNORMAL)==false
420 && status.equals(STATUS_FINISHED_DATA_LIMIT)==false
421 && status.equals(STATUS_FINISHED_DOCUMENT_LIMIT)==false
422 && status.equals(STATUS_FINISHED_TIME_LIMIT)==false
423 && status.equals(STATUS_MISCONFIGURED)==false
424 && status.equals(STATUS_PAUSED)==false
425 && status.equals(STATUS_CHECKPOINTING)==false
426 && status.equals(STATUS_PENDING)==false
427 && status.equals(STATUS_RUNNING)==false
428 && status.equals(STATUS_WAITING_FOR_PAUSE)==false
429 && status.equals(STATUS_PREPARING)==false){
430
431 throw new InvalidJobFileException("Status (line 3) in job file " +
432 "is not valid: '" + status + "'");
433 }
434
435 String tmp = jobReader.readLine();
436 if(tmp.equals("true")){
437 isReadOnly = true;
438 } else if(tmp.equals("false")){
439 isReadOnly = false;
440 } else {
441 throw new InvalidJobFileException("isReadOnly (line 4) in job" +
442 " file '" + jobFile.getAbsolutePath() + "' is not " +
443 "valid: '" + tmp + "'");
444 }
445
446 tmp = jobReader.readLine();
447 if(tmp.equals("true")){
448 this.isRunning = true;
449 } else if(tmp.equals("false")){
450 this.isRunning = false;
451 } else {
452 throw new InvalidJobFileException("isRunning (line 5) in job " +
453 "file '" + jobFile.getAbsolutePath() + "' is not valid: " +
454 "'" + tmp + "'");
455 }
456
457 tmp = jobReader.readLine();
458 try{
459 this.priority = Integer.parseInt(tmp);
460 } catch(NumberFormatException e){
461 throw new InvalidJobFileException("priority (line 5) in job " +
462 "file '" + jobFile.getAbsolutePath() + "' is not valid: " +
463 "'" + tmp + "'");
464 }
465
466 tmp = jobReader.readLine();
467 try{
468 this.numberOfJournalEntries = Integer.parseInt(tmp);
469 } catch(NumberFormatException e){
470 throw new InvalidJobFileException("numberOfJournalEntries " +
471 "(line 5) in job file '" + jobFile.getAbsolutePath() +
472 "' is not valid: " + "'" + tmp + "'");
473 }
474
475 tmp = jobReader.readLine();
476 try {
477 File f = new File(tmp);
478 this.settingsHandler = new XMLSettingsHandler((f.isAbsolute())?
479 f: new File(jobDir, f.getName()));
480 if(this.errorHandler != null){
481 this.settingsHandler.registerValueErrorHandler(errorHandler);
482 }
483 this.settingsHandler.initialize();
484 } catch (InvalidAttributeValueException e1) {
485 throw new InvalidJobFileException("Problem reading from settings " +
486 "file (" + tmp + ") specified in job file '" +
487 jobFile.getAbsolutePath() + "'\n" + e1.getMessage());
488 }
489
490 jobReader.readLine();
491
492
493 tmp = jobReader.readLine();
494 errorMessage = "";
495 while(tmp!=null){
496 errorMessage+=tmp+'\n';
497 tmp = jobReader.readLine();
498 }
499 if(errorMessage.length()==0){
500
501 errorMessage = null;
502 }
503
504
505
506 jobReader.close();
507 }
508
509 /***
510 * Cause the job to be written to persistent storage.
511 * This will also save the statistics tracker if it is not null and the
512 * job status is finished (regardless of how it's finished)
513 */
514 private void writeJobFile() {
515 if (isProfile) {
516 return;
517 }
518
519 final String jobDirAbsolute = jobDir.getAbsolutePath();
520 if (!jobDir.exists() || !jobDir.canWrite()) {
521 logger.warning("Can't update status on " +
522 jobDirAbsolute + " because file does not" +
523 " exist (or is unwriteable)");
524 return;
525 }
526 File f = new File(jobDirAbsolute, "state.job");
527
528 String settingsFile = getSettingsDirectory();
529
530
531 if(settingsFile.startsWith(jobDirAbsolute.concat(File.separator))) {
532 settingsFile = settingsFile.substring(jobDirAbsolute.length()+1);
533 }
534 try {
535 FileWriter jobWriter = new FileWriter(f, false);
536 try {
537 jobWriter.write(UID + "\n");
538 jobWriter.write(name + "\n");
539 jobWriter.write(status + "\n");
540 jobWriter.write(isReadOnly + "\n");
541 jobWriter.write(isRunning + "\n");
542 jobWriter.write(priority + "\n");
543 jobWriter.write(numberOfJournalEntries + "\n");
544 jobWriter.write(settingsFile + "\n");
545 jobWriter.write(statisticsFileSave + "\n");
546
547
548 if (errorMessage != null) {
549 jobWriter.write(errorMessage + "\n");
550 }
551 } finally {
552 if (jobWriter != null) {
553 jobWriter.close();
554 }
555 }
556 } catch (IOException e) {
557 logger.log(Level.WARNING, "An IOException occured saving job " +
558 name + " (" + UID + ")", e);
559 }
560 }
561
562 /***
563 * Returns this jobs unique ID (UID) that was issued by the
564 * CrawlJobHandler() when this job was first created.
565 *
566 * @return Job This jobs UID.
567 * @see CrawlJobHandler#getNextJobUID()
568 */
569 public String getUID(){
570 return UID;
571 }
572
573 /***
574 * Returns this job's 'name'. The name comes from the settings for this job,
575 * need not be unique and may change. For a unique identifier use
576 * {@link #getUID() getUID()}.
577 * <p>
578 * The name corrisponds to the value of the 'name' tag in the 'meta' section
579 * of the settings file.
580 *
581 * @return This job's 'name'
582 */
583 public String getJobName(){
584 return name;
585 }
586
587 /***
588 * Return the combination of given name and UID most commonly
589 * used in administrative interface.
590 *
591 * @return Job's name with UID notation
592 */
593 public String getDisplayName() {
594 return getJobName()+" ["+getUID()+"]";
595 }
596
597 /***
598 * Set this job's level of priority.
599 *
600 * @param priority The level of priority
601 *
602 * @see #getJobPriority()
603 * @see #PRIORITY_MINIMAL
604 * @see #PRIORITY_LOW
605 * @see #PRIORITY_AVERAGE
606 * @see #PRIORITY_HIGH
607 * @see #PRIORITY_CRITICAL
608 */
609 public void setJobPriority(int priority) {
610 this.priority = priority;
611 }
612
613 /***
614 * Get this job's level of priority.
615 *
616 * @return this job's priority
617 * @see #setJobPriority(int)
618 * @see #PRIORITY_MINIMAL
619 * @see #PRIORITY_LOW
620 * @see #PRIORITY_AVERAGE
621 * @see #PRIORITY_HIGH
622 * @see #PRIORITY_CRITICAL
623 */
624 public int getJobPriority() {
625 return priority;
626 }
627
628 /***
629 * Once called no changes can be made to the settings for this job.
630 * Typically this is done once a crawl is completed and further changes
631 * to the crawl order are therefor meaningless.
632 */
633 public void setReadOnly() {
634 isReadOnly = true;
635 writeJobFile();
636 }
637
638 /***
639 * Is job read only?
640 * @return false until setReadOnly has been invoked, after that it returns true.
641 */
642 public boolean isReadOnly(){
643 return isReadOnly;
644 }
645
646 /***
647 * Set the status of this CrawlJob.
648 *
649 * @param status Current status of CrawlJob
650 * (see constants defined here beginning with STATUS)
651 */
652 public void setStatus(String status) {
653 this.status = status;
654 writeJobFile();
655
656 }
657
658 /***
659 * @return Status of the crawler (Used by JMX).
660 */
661 public String getCrawlStatus() {
662 return this.controller != null?
663 this.controller.getState().toString(): "Illegal State";
664 }
665
666 /***
667 * Get the current status of this CrawlJob
668 *
669 * @return The current status of this CrawlJob
670 * (see constants defined here beginning with STATUS)
671 */
672 public String getStatus() {
673 return this.status;
674 }
675
676 /***
677 * Returns the settings handler for this job. It will have been initialized.
678 * @return the settings handler for this job.
679 */
680 public XMLSettingsHandler getSettingsHandler() {
681 return this.settingsHandler;
682 }
683 /***
684 * Is this a new job?
685 * @return True if is new.
686 */
687 public boolean isNew() {
688 return isNew;
689 }
690
691 /***
692 * Set if the job is considered to be a profile
693 * @return True if is a profile.
694 */
695 public boolean isProfile() {
696 return isProfile;
697 }
698
699 /***
700 * Set if the job is considered a new job or not.
701 * @param b Is the job considered to be new.
702 */
703 public void setNew(boolean b) {
704 isNew = b;
705 writeJobFile();
706 }
707
708 /***
709 * Returns true if the job is being crawled.
710 * @return true if the job is being crawled
711 */
712 public boolean isRunning() {
713 return isRunning;
714 }
715
716 /***
717 * Set if job is being crawled.
718 * @param b Is job being crawled.
719 */
720 protected void setRunning(boolean b) {
721 isRunning = b;
722 writeJobFile();
723
724
725
726
727
728 }
729
730 protected void unregisterMBean() {
731
732 if (this.mbeanServer == null) {
733 return;
734 }
735 try {
736 this.mbeanServer.unregisterMBean(this.mbeanName);
737 this.mbeanServer = null;
738 } catch (Exception e) {
739 logger.log(Level.SEVERE, "Failed with " + this.mbeanName, e);
740 }
741 }
742
743 /***
744 * Subclass of crawlcontroller that unregisters beans when stopped.
745 * Done as subclass so CrawlController doesn't get any JMX (or 'CrawlJob')
746 * pollution, so for sure CrawlJob is unregistered with JMX and so any
747 * listeners on the CrawlJob get a chance to get crawl ended message
748 * (These latter notifications may not actually be getting through -- TBD).
749 * <p>TODO: This override dirtys the data model since CC knows about CJs.
750 * The facility provided by this class emitting events and statistics so
751 * they can be read by JMX needs to go back into CC. Probably best to
752 * registering in JMX the CC, rather than CJ. Lets do this in Heritrix 2.0
753 * since means changing the JMX API some.
754 */
755 public class MBeanCrawlController extends CrawlController
756 implements Serializable {
757 private static final long serialVersionUID = -4608537998168407222L;
758 private CrawlJob cj = null;
759 private CompositeType ct = null;
760
761 public CrawlJob getCrawlJob() {
762 return this.cj;
763 }
764
765 public void setCrawlJob(CrawlJob cj) {
766 this.cj = cj;
767 }
768
769 public void progressStatisticsEvent(final EventObject e) {
770 super.progressStatisticsEvent(e);
771 if (this.cj.getMbeanName() == null) {
772
773 return;
774 }
775
776 Map s = ((StatisticsTracking)e.getSource()).getProgressStatistics();
777
778
779 CompositeData cd = null;
780 try {
781 if (this.ct == null) {
782 this.ct = JmxUtils.createCompositeType(s, PROG_STATS,
783 PROG_STATS + " for " + this.cj.getMbeanName());
784 }
785 cd = new CompositeDataSupport(this.ct, s);
786 } catch (OpenDataException ode) {
787 ode.printStackTrace();
788 }
789 if (cd != null) {
790 Notification n = new Notification(PROG_STATS,
791 this.cj.getMbeanName(), getNotificationsSequenceNumber(),
792 ((StatisticsTracking)e.getSource()).
793 getProgressStatisticsLine());
794 n.setUserData(cd);
795 this.cj.sendNotification(n);
796 }
797 }
798
799 protected void completeStop() {
800 try {
801 super.completeStop();
802 } finally {
803 if (this.cj != null) {
804 this.cj.unregisterMBean();
805 }
806 this.cj = null;
807 }
808 }
809 }
810
/***
 * Obtain the CrawlController this job will run on.
 * If the crawl order specifies a checkpoint to recover from,
 * deserializes the controller that was saved into that checkpoint's
 * directory; otherwise creates a fresh MBeanCrawlController.
 * @return the controller to use (never null)
 * @throws InitializationException if the checkpointed controller cannot
 * be read back from disk
 */
protected CrawlController setupCrawlController()
throws InitializationException {
    CrawlController controller = null;

    // Check if we're to do a checkpoint recover. If so, deserialize the
    // controller instance saved when the checkpoint was taken.
    Checkpoint cp = CrawlController.
        getCheckpointRecover(getSettingsHandler().getOrder());
    if (cp != null) {
        try {
            controller = (MBeanCrawlController)CheckpointUtils.
                readObjectFromFile(MBeanCrawlController.class,
                    cp.getDirectory());
        } catch (FileNotFoundException e) {
            throw new InitializationException(e);
        } catch (IOException e) {
            throw new InitializationException(e);
        } catch (ClassNotFoundException e) {
            throw new InitializationException(e);
        }
    } else {
        // No checkpoint recover: start with a fresh controller.
        controller = new MBeanCrawlController();
    }
    return controller;
}
837
/***
 * Factory for this job's controller type.
 * @return a new, uninitialized MBeanCrawlController
 */
protected CrawlController createCrawlController() {
    return new MBeanCrawlController();
}
841
842 public void setupForCrawlStart()
843 throws InitializationException {
844 try {
845 this.controller = setupCrawlController();
846
847 this.controller.addCrawlStatusListener(this);
848 this.controller.initialize(getSettingsHandler());
849
850 ((MBeanCrawlController)this.controller).setCrawlJob(this);
851
852 this.openMBeanInfo = buildMBeanInfo();
853 try {
854 Heritrix.registerMBean(this, getJmxJobName(),
855 CRAWLJOB_JMXMBEAN_TYPE);
856 } catch (InstanceAlreadyExistsException e) {
857 throw new InitializationException(e);
858 } catch (MBeanRegistrationException e) {
859 throw new InitializationException(e);
860 } catch (NotCompliantMBeanException e) {
861 throw new InitializationException(e);
862 }
863 } catch (InitializationException e) {
864
865 setStatus(CrawlJob.STATUS_MISCONFIGURED);
866 setErrorMessage("A fatal InitializationException occured when "
867 + "loading job:\n" + e.getMessage());
868
869 e.printStackTrace();
870 this.controller = null;
871 throw e;
872 }
873 setStatus(CrawlJob.STATUS_RUNNING);
874 setRunning(true);
875 }
876
877 public void stopCrawling() {
878 if(this.controller != null) {
879 this.controller.requestCrawlStop();
880 }
881 }
882
883 /***
884 * @return One-line Frontier report.
885 */
886 public String getFrontierOneLine() {
887 if (this.controller == null || this.controller.getFrontier() == null) {
888 return "Crawler not running";
889 }
890 return this.controller.getFrontier().singleLineReport();
891 }
892
893 /***
894 * @param reportName Name of report to write.
895 * @return A report of the frontier's status.
896 */
897 public String getFrontierReport(final String reportName) {
898 if (this.controller == null || this.controller.getFrontier() == null) {
899 return "Crawler not running";
900 }
901 return ArchiveUtils.writeReportToString(this.controller.getFrontier(),
902 reportName);
903 }
904
905 /***
906 * Write the requested frontier report to the given PrintWriter
907 * @param reportName Name of report to write.
908 * @param writer Where to write to.
909 */
910 public void writeFrontierReport(String reportName, PrintWriter writer) {
911 if (this.controller == null || this.controller.getFrontier() == null) {
912 writer.println("Crawler not running.");
913 return;
914 }
915 this.controller.getFrontier().reportTo(reportName,writer);
916 }
917
918 /***
919 * @return One-line threads report.
920 */
921 public String getThreadOneLine() {
922 if (this.controller == null) {
923 return "Crawler not running";
924 }
925 return this.controller.oneLineReportThreads();
926 }
927
928 /***
929 * Get the CrawlControllers ToeThreads report for the running crawl.
930 * @return The CrawlControllers ToeThreads report
931 */
932 public String getThreadsReport() {
933 if (this.controller == null) {
934 return "Crawler not running";
935 }
936 return ArchiveUtils.writeReportToString(this.controller.getToePool(),
937 null);
938 }
939
940 /***
941 * Write the requested threads report to the given PrintWriter
942 * @param reportName Name of report to write.
943 * @param writer Where to write to.
944 */
945 public void writeThreadsReport(String reportName, PrintWriter writer) {
946 if (this.controller == null || this.controller.getFrontier() == null) {
947 writer.println("Crawler not running.");
948 return;
949 }
950 this.controller.getToePool().reportTo(reportName, writer);
951 }
952
953 /***
954 * Kills a thread. For details see
955 * {@link org.archive.crawler.framework.ToePool#killThread(int, boolean)
956 * ToePool.killThread(int, boolean)}.
957 * @param threadNumber Thread to kill.
958 * @param replace Should thread be replaced.
959 * @see org.archive.crawler.framework.ToePool#killThread(int, boolean)
960 */
961 public void killThread(int threadNumber, boolean replace) {
962 if (this.controller == null) {
963 return;
964 }
965 this.controller.killThread(threadNumber, replace);
966 }
967
968 /***
969 * Get the Processors report for the running crawl.
970 * @return The Processors report for the running crawl.
971 */
972 public String getProcessorsReport() {
973 if (this.controller == null) {
974 return "Crawler not running";
975 }
976 return ArchiveUtils.writeReportToString(this.controller,
977 CrawlController.PROCESSORS_REPORT);
978 }
979
980 /***
981 * Returns the directory where the configuration files for this job are
982 * located.
983 *
984 * @return the directory where the configuration files for this job are
985 * located
986 */
987 public String getSettingsDirectory() {
988 return settingsHandler.getOrderFile().getPath();
989 }
990
991 /***
992 * Returns the path of the job's base directory. For profiles this is always
993 * equal to <code>new File(getSettingsDirectory())</code>.
994 * @return the path of the job's base directory.
995 */
996 public File getDirectory(){
997 return isProfile? new File(getSettingsDirectory()): jobDir;
998 }
999
1000 /***
1001 * Get the error message associated with this job. Will return null if there
1002 * is no error message.
1003 * @return the error message associated with this job
1004 */
1005 public String getErrorMessage() {
1006 return errorMessage;
1007 }
1008
1009 /***
1010 * Set an error message for this job. Generally this only occurs if the job
1011 * is misconfigured.
1012 * @param string the error message associated with this job
1013 */
1014 public void setErrorMessage(String string) {
1015 errorMessage = string;
1016 writeJobFile();
1017 }
1018
1019 /***
1020 * @return Returns the number of journal entries.
1021 */
1022 public int getNumberOfJournalEntries() {
1023 return numberOfJournalEntries;
1024 }
1025
1026 /***
1027 * @param numberOfJournalEntries The number of journal entries to set.
1028 */
1029 public void setNumberOfJournalEntries(int numberOfJournalEntries) {
1030 this.numberOfJournalEntries = numberOfJournalEntries;
1031 writeJobFile();
1032 }
1033
1034 /***
1035 * @return Returns the error handler for this crawl job
1036 */
1037 public CrawlJobErrorHandler getErrorHandler() {
1038 return errorHandler;
1039 }
1040
1041 /***
1042 * Read all the checkpoints found in the job's checkpoints
1043 * directory into Checkpoint instances
1044 * @return Collection containing list of all checkpoints.
1045 */
1046 public Collection scanCheckpoints() {
1047 File checkpointsDirectory =
1048 settingsHandler.getOrder().getCheckpointsDirectory();
1049 File[] perCheckpointDirs = checkpointsDirectory.listFiles();
1050 Collection<Checkpoint> checkpoints = new ArrayList<Checkpoint>();
1051 if (perCheckpointDirs != null) {
1052 for (int i = 0; i < perCheckpointDirs.length; i++) {
1053 Checkpoint cp = new Checkpoint(perCheckpointDirs[i]);
1054 checkpoints.add(cp);
1055 }
1056 }
1057 return checkpoints;
1058 }
1059
    /**
     * Returns the absolute path of the specified log.
     * Note: If crawl has not begun, this file may not exist.
     * @param log Filename of the log (e.g. "crawl.log").
     * @return the absolute path for the specified log.
     * @throws AttributeNotFoundException
     * @throws ReflectionException
     * @throws MBeanException
     */
    public String getLogPath(String log)
    throws AttributeNotFoundException, MBeanException, ReflectionException {
        // Logs location as configured in the crawl order; may be relative.
        String logsPath = (String)settingsHandler.getOrder().
            getAttribute(CrawlOrder.ATTR_LOGS_PATH);
        CrawlOrder order = settingsHandler.getOrder();
        String diskPath = (String) order.getAttribute(null,
            CrawlOrder.ATTR_DISK_PATH);
        // Resolve the configured disk path against the job's working
        // directory.
        File disk = settingsHandler.
            getPathRelativeToWorkingDirectory(diskPath);
        File f = new File(logsPath, log);
        if (!f.isAbsolute()) {
            // Relative logs path: anchor it under the disk directory.
            f = new File(disk.getPath(), f.getPath());
        }
        return f.getAbsolutePath();
    }
1084
1085
1086
1087 protected void pause() {
1088 if (this.controller != null && this.controller.isPaused() == false) {
1089 this.controller.requestCrawlPause();
1090 }
1091 }
1092
1093 protected void resume() {
1094 if (this.controller != null) {
1095 this.controller.requestCrawlResume();
1096 }
1097 }
1098
1099 /***
1100 * @throws IllegalStateException Thrown if crawl is not paused.
1101 */
1102 protected void checkpoint() throws IllegalStateException {
1103 if (this.controller != null) {
1104 this.controller.requestCrawlCheckpoint();
1105 }
1106 }
1107
1108 /***
1109 * @return True if checkpointing.
1110 */
1111 public boolean isCheckpointing() {
1112 return this.controller != null? this.controller.isCheckpointing(): false;
1113 }
1114
    /***
     * If its a HostQueuesFrontier, needs to be flushed for the queued.
     */
    protected void flush() {
        // Intentionally empty: hook for subclasses/frontiers that buffer
        // queued URIs and need an explicit flush after scheduling.
    }
1121
1122 /***
1123 * Delete any URI from the frontier of the current (paused) job that match
1124 * the specified regular expression. If the current job is not paused (or
1125 * there is no current job) nothing will be done.
1126 * @param regexpr Regular expression to delete URIs by.
1127 * @return the number of URIs deleted
1128 */
1129 public long deleteURIsFromPending(String regexpr){
1130 return deleteURIsFromPending(regexpr,null);
1131 }
1132
    /**
     * Delete any URI from the frontier of the current (paused) job that
     * match the specified regular expression. If the current job is not
     * paused (or there is no current job) nothing will be done.
     * @param uriPattern Regular expression that URIs must match to be
     * deleted.
     * @param queuePattern Regular expression the URI's queue must match, or
     * null to consider all queues.
     * @return the number of URIs deleted
     */
    public long deleteURIsFromPending(String uriPattern, String queuePattern){
        return (this.controller != null &&
                this.controller.getFrontier() != null &&
                this.controller.isPaused())?
            this.controller.getFrontier().deleteURIs(uriPattern,queuePattern): 0;
    }
1146
1147 public String importUris(String file, String style, String force) {
1148 return importUris(file, style, "true".equals(force));
1149 }
1150
1151 public String importUris(final String fileOrUrl, final String style,
1152 final boolean forceRevisit) {
1153 return importUris(fileOrUrl, style, forceRevisit, false);
1154 }
1155
1156 /***
1157 * @param fileOrUrl Name of file w/ seeds.
1158 * @param style What style of seeds -- crawl log, recovery journal, or
1159 * seeds file.
1160 * @param forceRevisit Should we revisit even if seen before?
1161 * @param areSeeds Is the file exclusively seeds?
1162 * @return A display string that has a count of all added.
1163 */
1164 public String importUris(final String fileOrUrl, final String style,
1165 final boolean forceRevisit, final boolean areSeeds) {
1166 InputStream is =
1167 IoUtils.getInputStream(this.controller.getDisk(), fileOrUrl);
1168 String message = null;
1169
1170 if (is == null) {
1171 message = "Failed to get inputstream from " + fileOrUrl;
1172 logger.severe(message);
1173 } else {
1174 int addedCount = importUris(is, style, forceRevisit, areSeeds);
1175 message = Integer.toString(addedCount) + " URIs added from " +
1176 fileOrUrl;
1177 }
1178 return message;
1179 }
1180
1181 protected int importUris(InputStream is, String style,
1182 boolean forceRevisit) {
1183 return importUris(is, style, forceRevisit, false);
1184 }
1185
1186 /***
1187 * Import URIs.
1188 * @param is Stream to use as URI source.
1189 * @param style Style in which URIs are rendored. Currently support for
1190 * <code>recoveryJournal</code>, <code>crawlLog</code>, and seeds file
1191 * format (i.e <code>default</code>) where <code>default</code> style is
1192 * a UURI per line (comments allowed).
1193 * @param forceRevisit Whether we should revisit this URI even if we've
1194 * visited it previously.
1195 * @param areSeeds Are the imported URIs seeds?
1196 * @return Count of added URIs.
1197 */
1198 protected int importUris(InputStream is, String style,
1199 boolean forceRevisit, final boolean areSeeds) {
1200
1201 String extractor;
1202 String output;
1203 if(CRAWL_LOG_STYLE.equals(style)) {
1204
1205 extractor = "//S+//s+//S+//s+//S+//s+(//S+//s+//S+//s+//S+//s+).*";
1206 output = "$1";
1207 } else if (RECOVERY_JOURNAL_STYLE.equals(style)) {
1208
1209 extractor = "//S+//s+((//S+)(?://s+//S+//s+//S+)?)//s*";
1210 output = "$1";
1211 } else {
1212 extractor =
1213 RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT;
1214 output = RegexpLineIterator.ENTRY;
1215 }
1216
1217 controller.installThreadContextSettingsHandler();
1218
1219
1220 BufferedReader br = null;
1221 int addedCount = 0;
1222 try {
1223 br = new BufferedReader(new InputStreamReader(is));
1224 Iterator iter = new RegexpLineIterator(new LineReadingIterator(br),
1225 RegexpLineIterator.COMMENT_LINE, extractor, output);
1226 while(iter.hasNext()) {
1227 try {
1228 importUri((String)iter.next(), forceRevisit, areSeeds,
1229 false);
1230 addedCount++;
1231 } catch (URIException e) {
1232 e.printStackTrace();
1233 }
1234 }
1235 br.close();
1236 flush();
1237 } catch (IOException e) {
1238 e.printStackTrace();
1239 }
1240 return addedCount;
1241 }
1242
1243 /***
1244 * Schedule a uri.
1245 * @param uri Uri to schedule.
1246 * @param forceFetch Should it be forcefetched.
1247 * @param isSeed True if seed.
1248 * @throws URIException
1249 */
1250 public void importUri(final String uri, final boolean forceFetch,
1251 final boolean isSeed)
1252 throws URIException {
1253 importUri(uri, forceFetch, isSeed, true);
1254 }
1255
1256 /***
1257 * Schedule a uri.
1258 * @param str String that can be: 1. a UURI, 2. a snippet of the
1259 * crawl.log line, or 3. a snippet from recover log. See
1260 * {@link #importUris(InputStream, String, boolean)} for how it subparses
1261 * the lines from crawl.log and recover.log.
1262 * @param forceFetch Should it be forcefetched.
1263 * @param isSeed True if seed.
1264 * @param isFlush If true, flush the frontier IF it implements
1265 * flushing.
1266 * @throws URIException
1267 */
1268 public void importUri(final String str, final boolean forceFetch,
1269 final boolean isSeed, final boolean isFlush)
1270 throws URIException {
1271 CandidateURI caUri = CandidateURI.fromString(str);
1272 caUri.setForceFetch(forceFetch);
1273 if (isSeed) {
1274 caUri.setIsSeed(isSeed);
1275 if (caUri.getVia() == null || caUri.getVia().length() <= 0) {
1276
1277
1278
1279 this.controller.getScope().addSeed(caUri);
1280 }
1281 }
1282 this.controller.getFrontier().schedule(caUri);
1283 if (isFlush) {
1284 flush();
1285 }
1286 }
1287
1288
1289 /***
1290 * @return Our mbean info (Needed for CrawlJob to qualify as a
1291 * DynamicMBean).
1292 */
1293 public MBeanInfo getMBeanInfo() {
1294 return this.openMBeanInfo;
1295 }
1296
    /**
     * Build up the MBean info for Heritrix main.
     * Assembles, in order: the fixed read-only job status attributes, the
     * crawl-order settings tree, the exposed bdbje environment attributes,
     * the job operations, the exposed bdbje operations, and the
     * notification metadata.
     * @return Return created mbean info instance.
     * @throws InitializationException if the bdbje helper cannot be built.
     */
    protected OpenMBeanInfoSupport buildMBeanInfo()
    throws InitializationException {
        // Fixed, read-only job status attributes.
        List<OpenMBeanAttributeInfo> attributes
            = new ArrayList<OpenMBeanAttributeInfo>();

        attributes.add(new OpenMBeanAttributeInfoSupport(NAME_ATTR,
            "Crawl job name", SimpleType.STRING, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(STATUS_ATTR,
            "Short basic status message", SimpleType.STRING, true, false,
            false));
        attributes.add(
            new OpenMBeanAttributeInfoSupport(FRONTIER_SHORT_REPORT_ATTR,
                "Short frontier report", SimpleType.STRING, true,
                false, false));
        attributes.add(
            new OpenMBeanAttributeInfoSupport(THREADS_SHORT_REPORT_ATTR,
                "Short threads report", SimpleType.STRING, true,
                false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(UID_ATTR,
            "Crawl job UID", SimpleType.STRING, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(TOTAL_DATA_ATTR,
            "Total data received", SimpleType.LONG, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(CRAWL_TIME_ATTR,
            "Crawl time", SimpleType.LONG, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(CURRENT_DOC_RATE_ATTR,
            "Current crawling rate (Docs/sec)", SimpleType.DOUBLE,
            true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(CURRENT_KB_RATE_ATTR,
            "Current crawling rate (Kb/sec)", SimpleType.LONG,
            true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(THREAD_COUNT_ATTR,
            "Active thread count", SimpleType.INTEGER, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(DOC_RATE_ATTR,
            "Crawling rate (Docs/sec)", SimpleType.DOUBLE,
            true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(KB_RATE_ATTR,
            "Current crawling rate (Kb/sec)", SimpleType.LONG,
            true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(DOWNLOAD_COUNT_ATTR,
            "Count of downloaded documents", SimpleType.LONG,
            true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(DISCOVERED_COUNT_ATTR,
            "Count of discovered documents", SimpleType.LONG,
            true, false, false));

        // Add in the crawl order attributes (recursively walks the whole
        // settings tree).
        addCrawlOrderAttributes(this.getController().getOrder(), attributes);

        // Build the bdbje helper for the controller's environment, then
        // expose only the named subset of its attributes (converted to
        // open-MBean attribute info by addBdbjeAttributes).
        Environment env = this.controller.getBdbEnvironment();
        try {
            this.bdbjeMBeanHelper =
                new JEMBeanHelper(env.getConfig(), env.getHome(), true);
        } catch (DatabaseException e) {
            // Rewrap as InitializationException, preserving the original
            // stack trace.
            e.printStackTrace();
            InitializationException ie =
                new InitializationException(e.getMessage());
            ie.setStackTrace(e.getStackTrace());
            throw ie;
        }
        this.bdbjeAttributeNameList = Arrays.asList(new String [] {
            JEMBeanHelper.ATT_ENV_HOME,
            JEMBeanHelper.ATT_OPEN,
            JEMBeanHelper.ATT_IS_READ_ONLY,
            JEMBeanHelper.ATT_IS_TRANSACTIONAL,
            JEMBeanHelper.ATT_CACHE_SIZE,
            JEMBeanHelper.ATT_CACHE_PERCENT,
            JEMBeanHelper.ATT_LOCK_TIMEOUT,
            JEMBeanHelper.ATT_IS_SERIALIZABLE,
            JEMBeanHelper.ATT_SET_READ_ONLY,
        });
        addBdbjeAttributes(attributes,
            this.bdbjeMBeanHelper.getAttributeList(env),
            this.bdbjeAttributeNameList);

        // Operations.
        List<OpenMBeanOperationInfo> operations
            = new ArrayList<OpenMBeanOperationInfo>();
        OpenMBeanParameterInfo[] args = new OpenMBeanParameterInfoSupport[3];
        args[0] = new OpenMBeanParameterInfoSupport("url",
            "URL to add to the frontier", SimpleType.STRING);
        args[1] = new OpenMBeanParameterInfoSupport("forceFetch",
            "True if URL is to be force fetched", SimpleType.BOOLEAN);
        args[2] = new OpenMBeanParameterInfoSupport("seed",
            "True if URL is a seed", SimpleType.BOOLEAN);
        operations.add(new OpenMBeanOperationInfoSupport(IMPORT_URI_OPER,
            "Add passed URL to the frontier", args, SimpleType.VOID,
            MBeanOperationInfo.ACTION));

        args = new OpenMBeanParameterInfoSupport[4];
        args[0] = new OpenMBeanParameterInfoSupport("pathOrUrl",
            "Path or URL to file of URLs", SimpleType.STRING);
        args[1] = new OpenMBeanParameterInfoSupport("style",
            "Format format:default|crawlLog|recoveryJournal",
            SimpleType.STRING);
        args[2] = new OpenMBeanParameterInfoSupport("forceFetch",
            "True if URLs are to be force fetched", SimpleType.BOOLEAN);
        args[3] = new OpenMBeanParameterInfoSupport("seed",
            "True if all content are seeds.", SimpleType.BOOLEAN);
        operations.add(new OpenMBeanOperationInfoSupport(IMPORT_URIS_OPER,
            "Add file of passed URLs to the frontier", args, SimpleType.STRING,
            MBeanOperationInfo.ACTION));

        // Dump-URIs requires the crawl be paused; enforced in invoke().
        args = new OpenMBeanParameterInfoSupport[4];
        args[0] = new OpenMBeanParameterInfoSupport("filename",
            "File to print to", SimpleType.STRING);
        args[1] = new OpenMBeanParameterInfoSupport("regexp",
            "Regular expression URLs must match", SimpleType.STRING);
        args[2] = new OpenMBeanParameterInfoSupport("numberOfMatches",
            "Maximum number of matches to return", SimpleType.INTEGER);
        args[3] = new OpenMBeanParameterInfoSupport("verbose",
            "Should they be verbose descriptions", SimpleType.BOOLEAN);
        operations.add(new OpenMBeanOperationInfoSupport(DUMP_URIS_OPER,
            "Dump pending URIs from frontier to a file", args,
            SimpleType.VOID, MBeanOperationInfo.ACTION));

        operations.add(new OpenMBeanOperationInfoSupport(PAUSE_OPER,
            "Pause crawling (noop if already paused)", null, SimpleType.VOID,
            MBeanOperationInfo.ACTION));

        operations.add(new OpenMBeanOperationInfoSupport(RESUME_OPER,
            "Resume crawling (noop if already resumed)", null,
            SimpleType.VOID, MBeanOperationInfo.ACTION));

        args = new OpenMBeanParameterInfoSupport[1];
        args[0] = new OpenMBeanParameterInfoSupport("name",
            "Name of report ('all', 'standard', etc.).", SimpleType.STRING);
        operations.add(new OpenMBeanOperationInfoSupport(FRONTIER_REPORT_OPER,
            "Full frontier report", args, SimpleType.STRING,
            MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(THREADS_REPORT_OPER,
            "Full thread report", null, SimpleType.STRING,
            MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(SEEDS_REPORT_OPER,
            "Seeds report", null, SimpleType.STRING, MBeanOperationInfo.INFO));

        operations.add(
            new OpenMBeanOperationInfoSupport(PROGRESS_STATISTICS_OPER,
                "Progress statistics at time of invocation", null,
                SimpleType.STRING, MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(
            PROGRESS_STATISTICS_LEGEND_OPER,
            "Progress statistics legend", null,
            SimpleType.STRING, MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(CHECKPOINT_OPER,
            "Start a checkpoint", null, SimpleType.VOID,
            MBeanOperationInfo.ACTION));

        // Expose only the named subset of bdbje operations (converted to
        // open-MBean operation info by addBdbjeOperations).
        this.bdbjeOperationsNameList = Arrays.asList(new String[] { "cleanLog",
            "evictMemory", "checkpoint", "sync",
            "getEnvironmentStatsToString", "getLockStatsToString",
            "getDatabaseNames", OP_DB_STAT
        });
        addBdbjeOperations(operations,
            this.bdbjeMBeanHelper.getOperationList(env),
            this.bdbjeOperationsNameList);

        // Notifications: crawl status events plus progress statistics.
        List<MBeanNotificationInfo> notifications
            = new ArrayList<MBeanNotificationInfo>();
        notifications.add(
            new MBeanNotificationInfo(new String [] {"crawlStarted",
                "crawlEnding", "crawlPaused", "crawlResuming", PROG_STATS},
                this.getClass().getName() + ".notifications",
                "CrawlStatusListener events and progress statistics as " +
                "notifications"));
        MBeanNotificationInfo [] notificationsArray =
            new MBeanNotificationInfo[notifications.size()];
        notifications.toArray(notificationsArray);

        // Package attributes, operations and notifications into the final
        // open-MBean info instance.
        OpenMBeanAttributeInfoSupport[] attributesArray =
            new OpenMBeanAttributeInfoSupport[attributes.size()];
        attributes.toArray(attributesArray);
        OpenMBeanOperationInfoSupport[] operationsArray =
            new OpenMBeanOperationInfoSupport[operations.size()];
        operations.toArray(operationsArray);
        return new OpenMBeanInfoSupport(this.getClass().getName(),
            "Current Crawl Job as OpenMBean",
            attributesArray,
            new OpenMBeanConstructorInfoSupport [] {},
            operationsArray,
            notificationsArray);
    }
1499
1500 protected void addBdbjeAttributes(
1501 final List<OpenMBeanAttributeInfo> attributes,
1502 final List<MBeanAttributeInfo> bdbjeAttributes,
1503 final List<String> bdbjeNamesToAdd) {
1504 for (MBeanAttributeInfo info: bdbjeAttributes) {
1505 if (bdbjeNamesToAdd.contains(info.getName())) {
1506 attributes.add(JmxUtils.convertToOpenMBeanAttribute(info));
1507 }
1508 }
1509 }
1510
1511 protected void addBdbjeOperations(
1512 final List<OpenMBeanOperationInfo> operations,
1513 final List<MBeanOperationInfo> bdbjeOperations,
1514 final List<String> bdbjeNamesToAdd) {
1515 for (MBeanOperationInfo info: bdbjeOperations) {
1516 if (bdbjeNamesToAdd.contains(info.getName())) {
1517 OpenMBeanOperationInfo omboi = null;
1518 if (info.getName().equals(OP_DB_STAT)) {
1519
1520
1521
1522 omboi = JmxUtils.convertToOpenMBeanOperation(info, null,
1523 SimpleType.STRING);
1524 MBeanParameterInfo[] params = omboi.getSignature();
1525 OpenMBeanParameterInfo[] args =
1526 new OpenMBeanParameterInfoSupport[params.length + 1];
1527 for (int ii = 0; ii < params.length; ii++) {
1528 args[ii] = (OpenMBeanParameterInfo) params[ii];
1529 }
1530 args[params.length] = new OpenMBeanParameterInfoSupport(
1531 "name", "Database name", SimpleType.STRING);
1532 omboi = new OpenMBeanOperationInfoSupport(omboi.getName(),
1533 omboi.getDescription(), args, omboi.getReturnOpenType(),
1534 omboi.getImpact());
1535 } else {
1536 omboi = JmxUtils.convertToOpenMBeanOperation(info);
1537 }
1538 operations.add(omboi);
1539 }
1540 }
1541 }
1542
    /**
     * Recursively walk the given settings ComplexType, appending an
     * open-MBean attribute for every leaf setting that can be represented
     * as an open type.
     * @param type Settings node to walk (initially the CrawlOrder).
     * @param attributes List the discovered attributes are appended to.
     */
    protected void addCrawlOrderAttributes(final ComplexType type,
            final List<OpenMBeanAttributeInfo> attributes) {
        for (final Iterator i = type.getAttributeInfoIterator(null);
                i.hasNext();) {
            ModuleAttributeInfo info = (ModuleAttributeInfo)i.next();
            if (ORDER_EXCLUDE.contains(info.getName())) {
                // Skip attributes explicitly excluded from JMX exposure.
                continue;
            }
            // Attributes are named by their full path in the settings tree.
            String absoluteName = type.getAbsoluteName() + "/" + info.getName();
            if (JmxUtils.isOpenType(info.getType())) {
                String description = info.getDescription();
                if (description == null || description.length() <= 0) {
                    // Open-MBean info requires a description; fall back on
                    // the attribute name.
                    description = info.getName();
                }
                attributes.add(new OpenMBeanAttributeInfoSupport(
                    absoluteName, description,
                    JmxUtils.getOpenType(info.getType()), true, true, false));
            } else if(info.isComplexType()) {
                // Interior node: recurse into the child settings type.
                try {
                    ComplexType c =
                        (ComplexType)type.getAttribute(info.getName());
                    addCrawlOrderAttributes(c, attributes);
                } catch (AttributeNotFoundException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute", e);
                } catch (MBeanException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute", e);
                } catch (ReflectionException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute", e);
                }
            } else if (info.getType().equals(TextField.class.getName())) {
                // TextFields are exposed as plain strings.
                attributes.add(new OpenMBeanAttributeInfoSupport(
                    absoluteName, info.getDescription(),
                    SimpleType.STRING, true, true, false));
            } else {
                // No open-type mapping for this type; note it and move on.
                logger.fine(info.getType());
            }
        }
    }
1586
1587 public Object getAttribute(String attribute_name)
1588 throws AttributeNotFoundException {
1589 if (attribute_name == null) {
1590 throw new RuntimeOperationsException(
1591 new IllegalArgumentException("Attribute name cannot be null"),
1592 "Cannot call getAttribute with null attribute name");
1593 }
1594
1595
1596 if (this.controller == null) {
1597 throw new RuntimeOperationsException(
1598 new NullPointerException("Controller is null"),
1599 "Controller is null");
1600 }
1601
1602
1603 if (this.bdbjeAttributeNameList.contains(attribute_name)) {
1604 try {
1605 return this.bdbjeMBeanHelper.getAttribute(
1606 this.controller.getBdbEnvironment(), attribute_name);
1607 } catch (MBeanException e) {
1608 throw new RuntimeOperationsException(new RuntimeException(e));
1609 }
1610 }
1611
1612
1613 if (attribute_name.
1614 startsWith(this.controller.getOrder().getAbsoluteName())) {
1615 return getCrawlOrderAttribute(attribute_name);
1616 }
1617
1618 if (!ATTRIBUTE_LIST.contains(attribute_name)) {
1619 throw new AttributeNotFoundException("Attribute " +
1620 attribute_name + " is unimplemented.");
1621 }
1622
1623
1624
1625
1626
1627 if (attribute_name.equals(STATUS_ATTR)) {
1628 return getCrawlStatus();
1629 }
1630 if (attribute_name.equals(NAME_ATTR)) {
1631 return getJobName();
1632 }
1633 if (attribute_name.equals(UID_ATTR)) {
1634 return getUID();
1635 }
1636 if (attribute_name.equals(TOTAL_DATA_ATTR)) {
1637 return new Long(this.controller == null &&
1638 this.controller.getStatistics() != null? 0:
1639 this.controller.getStatistics().totalBytesWritten());
1640 }
1641 if (attribute_name.equals(CRAWL_TIME_ATTR)) {
1642 return new Long(this.controller == null &&
1643 this.controller.getStatistics() != null? 0:
1644 this.controller.getStatistics().getCrawlerTotalElapsedTime() /
1645 1000);
1646 }
1647 if (attribute_name.equals(CURRENT_DOC_RATE_ATTR)) {
1648 return new Double(this.controller == null &&
1649 this.controller.getStatistics() != null? 0:
1650 this.controller.getStatistics().currentProcessedDocsPerSec());
1651 }
1652 if (attribute_name.equals(DOC_RATE_ATTR)) {
1653 return new Double(this.controller == null &&
1654 this.controller.getStatistics() != null? 0:
1655 this.controller.getStatistics().processedDocsPerSec());
1656 }
1657 if (attribute_name.equals(KB_RATE_ATTR)) {
1658 return new Long(this.controller == null &&
1659 this.controller.getStatistics() != null? 0:
1660 this.controller.getStatistics().currentProcessedKBPerSec());
1661 }
1662 if (attribute_name.equals(CURRENT_KB_RATE_ATTR)) {
1663 return new Long(this.controller == null &&
1664 this.controller.getStatistics() != null? 0:
1665 this.controller.getStatistics().processedKBPerSec());
1666 }
1667 if (attribute_name.equals(THREAD_COUNT_ATTR)) {
1668 return new Integer(this.controller == null &&
1669 this.controller.getStatistics() != null? 0:
1670 this.controller.getStatistics().activeThreadCount());
1671 }
1672 if (attribute_name.equals(FRONTIER_SHORT_REPORT_ATTR)) {
1673 return getFrontierOneLine();
1674 }
1675 if (attribute_name.equals(THREADS_SHORT_REPORT_ATTR)) {
1676 return getThreadOneLine();
1677 }
1678 if (attribute_name.equals(DISCOVERED_COUNT_ATTR)) {
1679 return new Long(this.controller == null &&
1680 this.controller.getStatistics() != null? 0:
1681 this.controller.getStatistics().totalCount());
1682 }
1683 if (attribute_name.equals(DOWNLOAD_COUNT_ATTR)) {
1684 return new Long(this.controller == null &&
1685 this.controller.getStatistics() != null? 0:
1686 this.controller.getStatistics().successfullyFetchedCount());
1687 }
1688
1689 throw new AttributeNotFoundException("Attribute " +
1690 attribute_name + " not found.");
1691 }
1692
1693 protected Object getCrawlOrderAttribute(final String attribute_name) {
1694 CrawlOrder order = this.getController().getOrder();
1695 Object result = null;
1696 try {
1697 result = getCrawlOrderAttribute(attribute_name.substring(order
1698 .getAbsoluteName().length()), order);
1699 } catch (NullPointerException e) {
1700 logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1701 } catch (AttributeNotFoundException e) {
1702 logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1703 } catch (MBeanException e) {
1704 logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1705 } catch (ReflectionException e) {
1706 logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1707 }
1708 return result;
1709 }
1710
1711 protected Object getCrawlOrderAttribute(final String attribute_name,
1712 final ComplexType ct)
1713 throws AttributeNotFoundException, MBeanException, ReflectionException {
1714 String subName = attribute_name.startsWith("/") ? attribute_name
1715 .substring(1) : attribute_name;
1716 int index = subName.indexOf("/");
1717 if (index <= 0) {
1718 MBeanAttributeInfo info = ct.getAttributeInfo(subName);
1719
1720 return info.getType().equals(TextField.class.getName()) ? ct
1721 .getAttribute(subName).toString() : ct
1722 .getAttribute(subName);
1723 }
1724 return getCrawlOrderAttribute(subName.substring(index + 1),
1725 (ComplexType) ct.getAttribute(subName.substring(0, index)));
1726 }
1727
1728 public AttributeList getAttributes(String [] attributeNames) {
1729 if (attributeNames == null) {
1730 throw new RuntimeOperationsException(
1731 new IllegalArgumentException("attributeNames[] cannot be " +
1732 "null"), "Cannot call getAttributes with null attribute " +
1733 "names");
1734 }
1735
1736
1737 if (this.controller == null) {
1738 throw new RuntimeOperationsException(
1739 new NullPointerException("Controller is null"),
1740 "Controller is null");
1741 }
1742
1743 AttributeList resultList = new AttributeList();
1744 if (attributeNames.length == 0) {
1745 return resultList;
1746 }
1747 for (int i = 0; i < attributeNames.length; i++) {
1748 try {
1749 Object value = getAttribute(attributeNames[i]);
1750 resultList.add(new Attribute(attributeNames[i], value));
1751 } catch (Exception e) {
1752 e.printStackTrace();
1753 }
1754 }
1755 return(resultList);
1756 }
1757
1758 public void setAttribute(Attribute attribute)
1759 throws AttributeNotFoundException {
1760
1761 CrawlOrder order = this.getController().getOrder();
1762 String attName = attribute.getName();
1763 if (attName.startsWith(order.getAbsoluteName())) {
1764 try {
1765 setCrawlOrderAttribute(attribute.getName().substring(
1766 order.getAbsoluteName().length()), order, attribute);
1767 } catch (NullPointerException e) {
1768 logger.log(Level.SEVERE, "Failed set of " + attName, e);
1769 } catch (AttributeNotFoundException e) {
1770 logger.log(Level.SEVERE, "Failed set of " + attName, e);
1771 } catch (MBeanException e) {
1772 logger.log(Level.SEVERE, "Failed set of " + attName, e);
1773 } catch (ReflectionException e) {
1774 logger.log(Level.SEVERE, "Failed set of " + attName, e);
1775 } catch (InvalidAttributeValueException e) {
1776 logger.log(Level.SEVERE, "Failed set of " + attName, e);
1777 }
1778 return;
1779 }
1780
1781
1782 if (this.bdbjeAttributeNameList.contains(attName)) {
1783 try {
1784 this.bdbjeMBeanHelper.setAttribute(this.controller
1785 .getBdbEnvironment(), attribute);
1786 } catch (AttributeNotFoundException e) {
1787 throw new RuntimeOperationsException(new RuntimeException(e));
1788 } catch (InvalidAttributeValueException e) {
1789 throw new RuntimeOperationsException(new RuntimeException(e));
1790 }
1791 return;
1792 }
1793
1794
1795 throw new AttributeNotFoundException("Attribute " + attName +
1796 " can not be set.");
1797 }
1798
1799 protected void setCrawlOrderAttribute(final String attribute_name,
1800 final ComplexType ct, final Attribute attribute)
1801 throws AttributeNotFoundException, InvalidAttributeValueException,
1802 MBeanException, ReflectionException {
1803 String subName = attribute_name.startsWith("/") ? attribute_name
1804 .substring(1) : attribute_name;
1805 int index = subName.indexOf("/");
1806 if (index <= 0) {
1807 ct.setAttribute(new Attribute(subName, attribute.getValue()));
1808 return;
1809 }
1810 setCrawlOrderAttribute(subName.substring(index + 1), (ComplexType) ct
1811 .getAttribute(subName.substring(0, index)), attribute);
1812 }
1813
1814 public AttributeList setAttributes(AttributeList attributes) {
1815 if (attributes == null) {
1816 throw new RuntimeOperationsException(
1817 new IllegalArgumentException("attributeNames[] cannot be " +
1818 "null"), "Cannot call getAttributes with null attribute " +
1819 "names");
1820 }
1821
1822 AttributeList resultList = new AttributeList();
1823 if (attributes.size() == 0) {
1824 return resultList;
1825 }
1826 for (int i = 0; i < attributes.size(); i++) {
1827 try {
1828 Attribute attr = (Attribute)attributes.get(i);
1829 setAttribute(attr);
1830 String an = attr.getName();
1831 Object newValue = getAttribute(an);
1832 resultList.add(new Attribute(an, newValue));
1833 } catch (Exception e) {
1834 e.printStackTrace();
1835 }
1836 }
1837 return resultList;
1838 }
1839
1840 public Object invoke(String operationName, Object[] params,
1841 String[] signature)
1842 throws ReflectionException {
1843 if (operationName == null) {
1844 throw new RuntimeOperationsException(
1845 new IllegalArgumentException("Operation name cannot be null"),
1846 "Cannot call invoke with null operation name");
1847 }
1848
1849 controller.installThreadContextSettingsHandler();
1850
1851 if (this.bdbjeOperationsNameList.contains(operationName)) {
1852 try {
1853 Object o = this.bdbjeMBeanHelper.invoke(
1854 this.controller.getBdbEnvironment(),
1855 operationName, params, signature);
1856
1857 if (operationName.equals(OP_DB_STAT)) {
1858 return o.toString();
1859 }
1860 return o;
1861 } catch (MBeanException e) {
1862 throw new RuntimeOperationsException(new RuntimeException(e));
1863 }
1864 }
1865
1866
1867
1868
1869
1870
1871
1872 if (operationName.equals(IMPORT_URI_OPER)) {
1873 JmxUtils.checkParamsCount(IMPORT_URI_OPER, params, 3);
1874 mustBeCrawling();
1875 try {
1876 importUri((String)params[0],
1877 ((Boolean)params[1]).booleanValue(),
1878 ((Boolean)params[2]).booleanValue());
1879 } catch (URIException e) {
1880 throw new RuntimeOperationsException(new RuntimeException(e));
1881 }
1882 return null;
1883 }
1884
1885 if (operationName.equals(IMPORT_URIS_OPER)) {
1886 JmxUtils.checkParamsCount(IMPORT_URIS_OPER, params, 4);
1887 mustBeCrawling();
1888 return importUris((String)params[0],
1889 ((String)params[1]).toString(),
1890 ((Boolean)params[2]).booleanValue(),
1891 ((Boolean)params[3]).booleanValue());
1892 }
1893
1894 if (operationName.equals(DUMP_URIS_OPER)) {
1895 JmxUtils.checkParamsCount(DUMP_URIS_OPER, params, 4);
1896 mustBeCrawling();
1897 if (!this.controller.isPaused()) {
1898 throw new RuntimeOperationsException(
1899 new IllegalArgumentException("Must " + "be paused"),
1900 "Cannot dump URI's from running job.");
1901 }
1902 dumpUris((String) params[0], (String) params[1],
1903 ((Integer) params[2]).intValue(), ((Boolean) params[3])
1904 .booleanValue());
1905 }
1906
1907 if (operationName.equals(PAUSE_OPER)) {
1908 JmxUtils.checkParamsCount(PAUSE_OPER, params, 0);
1909 mustBeCrawling();
1910 pause();
1911 return null;
1912 }
1913
1914 if (operationName.equals(RESUME_OPER)) {
1915 JmxUtils.checkParamsCount(RESUME_OPER, params, 0);
1916 mustBeCrawling();
1917 resume();
1918 return null;
1919 }
1920
1921 if (operationName.equals(FRONTIER_REPORT_OPER)) {
1922 JmxUtils.checkParamsCount(FRONTIER_REPORT_OPER, params, 1);
1923 mustBeCrawling();
1924 return getFrontierReport((String)params[0]);
1925 }
1926
1927 if (operationName.equals(THREADS_REPORT_OPER)) {
1928 JmxUtils.checkParamsCount(THREADS_REPORT_OPER, params, 0);
1929 mustBeCrawling();
1930 return getThreadsReport();
1931 }
1932
1933 if (operationName.equals(SEEDS_REPORT_OPER)) {
1934 JmxUtils.checkParamsCount(SEEDS_REPORT_OPER, params, 0);
1935 mustBeCrawling();
1936 StringWriter sw = new StringWriter();
1937 if (getStatisticsTracking() != null &&
1938 getStatisticsTracking() instanceof StatisticsTracker) {
1939 ((StatisticsTracker)getStatisticsTracking()).
1940 writeSeedsReportTo(new PrintWriter(sw));
1941 } else {
1942 sw.write("Unsupported");
1943 }
1944 return sw.toString();
1945 }
1946
1947 if (operationName.equals(CHECKPOINT_OPER)) {
1948 JmxUtils.checkParamsCount(CHECKPOINT_OPER, params, 0);
1949 mustBeCrawling();
1950 try {
1951 checkpoint();
1952 } catch (IllegalStateException e) {
1953 throw new RuntimeOperationsException(e);
1954 }
1955 return null;
1956 }
1957
1958 if (operationName.equals(PROGRESS_STATISTICS_OPER)) {
1959 JmxUtils.checkParamsCount(PROGRESS_STATISTICS_OPER, params, 0);
1960 mustBeCrawling();
1961 return getStatisticsTracking().getProgressStatisticsLine();
1962 }
1963
1964 if (operationName.equals(PROGRESS_STATISTICS_LEGEND_OPER)) {
1965 JmxUtils.checkParamsCount(PROGRESS_STATISTICS_LEGEND_OPER,
1966 params, 0);
1967 return getStatisticsTracking().progressStatisticsLegend();
1968 }
1969
1970 throw new ReflectionException(
1971 new NoSuchMethodException(operationName),
1972 "Cannot find the operation " + operationName);
1973 }
1974
1975 public void mustBeCrawling() {
1976 if (!isCrawling()) {
1977 throw new RuntimeOperationsException(
1978 new IllegalArgumentException("Not " +
1979 "crawling (Shouldn't ever be the case)"),
1980 "Not current crawling job?");
1981 }
1982 }
1983
1984 public boolean isCrawling() {
1985 return this.controller != null;
1986 }
1987
1988 /***
1989 * Utility method to get the stored list of ignored seed items (if any),
1990 * from the last time the seeds were imported to the frontier.
1991 *
1992 * @return String of all ignored seed items, or null if none
1993 */
1994 public String getIgnoredSeeds() {
1995 File ignoredFile = new File(getDirectory(),
1996 AbstractFrontier.IGNORED_SEEDS_FILENAME);
1997 if(!ignoredFile.exists()) {
1998 return null;
1999 }
2000 try {
2001 return FileUtils.readFileAsString(ignoredFile);
2002 } catch (IOException e) {
2003
2004 e.printStackTrace();
2005 return null;
2006 }
2007 }
2008
2009 /***
2010 * Forward a 'kick' update to current controller if any.
2011 * @see CrawlController#kickUpdate()
2012 */
2013 public void kickUpdate(){
2014 if (this.controller != null){
2015 this.controller.kickUpdate();
2016 }
2017 }
2018
2019 /***
2020 * Returns a URIFrontierMarker for the current, paused, job. If there is no
2021 * current job or it is not paused null will be returned.
2022 *
2023 * @param regexpr A regular expression that each URI must match in order to
2024 * be considered 'within' the marker.
2025 * @param inCacheOnly Limit marker scope to 'cached' URIs.
2026 * @return a URIFrontierMarker for the current job.
2027 * @see #getPendingURIsList(FrontierMarker, int, boolean)
2028 * @see org.archive.crawler.framework.Frontier#getInitialMarker(String,
2029 * boolean)
2030 * @see org.archive.crawler.framework.FrontierMarker
2031 */
2032 public FrontierMarker getInitialMarker(String regexpr,
2033 boolean inCacheOnly) {
2034 return (this.controller != null && this.controller.isPaused())?
2035 this.controller.getFrontier().getInitialMarker(regexpr, inCacheOnly):
2036 null;
2037 }
2038
2039 /***
2040 * Returns the frontiers URI list based on the provided marker. This method
2041 * will return null if there is not current job or if the current job is
2042 * not paused. Only when there is a paused current job will this method
2043 * return a URI list.
2044 *
2045 * @param marker URIFrontier marker
2046 * @param numberOfMatches Maximum number of matches to return
2047 * @param verbose Should detailed info be provided on each URI?
2048 * @return the frontiers URI list based on the provided marker
2049 * @throws InvalidFrontierMarkerException
2050 * When marker is inconsistent with the current state of the
2051 * frontier.
2052 * @see #getInitialMarker(String, boolean)
2053 * @see org.archive.crawler.framework.FrontierMarker
2054 */
2055 public ArrayList<String> getPendingURIsList(FrontierMarker marker,
2056 int numberOfMatches, boolean verbose)
2057 throws InvalidFrontierMarkerException {
2058 return (this.controller != null && this.controller.isPaused())?
2059 this.controller.getFrontier().getURIsList(marker, numberOfMatches,
2060 verbose):
2061 null;
2062 }
2063
2064 public void dumpUris(String filename, String regexp, int numberOfMatches,
2065 boolean verbose) {
2066 try {
2067 PrintWriter out = new PrintWriter(filename);
2068 FrontierMarker marker =
2069 controller.getFrontier().getInitialMarker(regexp, false);
2070 int matchesDumped = 0;
2071
2072 while(matchesDumped<numberOfMatches) {
2073 int batchMatches = Math.min(100, numberOfMatches-matchesDumped);
2074
2075 ArrayList<String> batchOfUris =
2076 getPendingURIsList(marker,batchMatches,false);
2077 for(String uriLine : batchOfUris) {
2078 out.write(uriLine);
2079 out.write("\n");
2080 matchesDumped++;
2081 }
2082 if (batchOfUris.size()<batchMatches) {
2083
2084 break;
2085 }
2086 }
2087 IOUtils.closeQuietly(out);
2088 } catch (FileNotFoundException e) {
2089 logger.log(Level.SEVERE, "Failed dumpUris write", e);
2090 } catch (InvalidFrontierMarkerException e) {
2091 logger.log(Level.SEVERE, "Failed dumpUris", e);
2092 }
2093 }
2094
2095 public void crawlStarted(String message) {
2096 if (this.mbeanName != null) {
2097
2098 sendNotification(new Notification("crawlStarted",
2099 this.mbeanName, getNotificationsSequenceNumber(), message));
2100 }
2101 }
2102
2103 public void crawlEnding(String sExitMessage) {
2104 setRunning(false);
2105 setStatus(sExitMessage);
2106 setReadOnly();
2107 if (this.mbeanName != null) {
2108 sendNotification(new Notification("crawlEnding", this.mbeanName,
2109 getNotificationsSequenceNumber(), sExitMessage));
2110 }
2111 }
2112
    /**
     * CrawlStatusListener callback for crawl completion.
     *
     * NOTE(review): the body is empty (only blank lines remain) -- end-of-
     * crawl handling is presumably performed elsewhere; confirm this no-op
     * is deliberate before relying on it.
     *
     * @param sExitMessage exit status message (unused).
     */
    public void crawlEnded(String sExitMessage) {
        // Intentionally(?) empty -- see note above.
    }
2126
2127 public void crawlPausing(String statusMessage) {
2128 setStatus(statusMessage);
2129 }
2130
2131 public void crawlPaused(String statusMessage) {
2132 setStatus(statusMessage);
2133 if (this.mbeanName != null) {
2134
2135 sendNotification(new Notification("crawlPaused", this.mbeanName,
2136 getNotificationsSequenceNumber(), statusMessage));
2137 }
2138 }
2139
2140 public void crawlResuming(String statusMessage) {
2141 setStatus(statusMessage);
2142 if (this.mbeanName != null) {
2143
2144 sendNotification(new Notification("crawlResuming", this.mbeanName,
2145 getNotificationsSequenceNumber(), statusMessage));
2146 }
2147 }
2148
2149 public void crawlCheckpoint(File checkpointDir) throws Exception {
2150 setStatus(CrawlJob.STATUS_CHECKPOINTING);
2151 }
2152
2153 public CrawlController getController() {
2154 return this.controller;
2155 }
2156
2157 public ObjectName preRegister(final MBeanServer server, ObjectName on)
2158 throws Exception {
2159 this.mbeanServer = server;
2160 @SuppressWarnings("unchecked")
2161 Hashtable<String,String> ht = on.getKeyPropertyList();
2162 if (!ht.containsKey(JmxUtils.NAME)) {
2163 throw new IllegalArgumentException("Name property required" +
2164 on.getCanonicalName());
2165 }
2166
2167
2168
2169 Heritrix h = getHostingHeritrix();
2170 if (h == null || h.getMBeanName() == null) {
2171 throw new IllegalArgumentException("Hosting heritrix not found " +
2172 "or not registered with JMX: " + on.getCanonicalName());
2173 }
2174 @SuppressWarnings("unchecked")
2175 Map<String,String> hht = h.getMBeanName().getKeyPropertyList();
2176 ht.put(JmxUtils.MOTHER, hht.get(JmxUtils.NAME));
2177 String port = hht.get(JmxUtils.JMX_PORT);
2178 if (port != null) {
2179 ht.put(JmxUtils.JMX_PORT, port);
2180 }
2181 ht.put(JmxUtils.HOST, hht.get(JmxUtils.HOST));
2182 if (!ht.containsKey(JmxUtils.TYPE)) {
2183 ht.put(JmxUtils.TYPE, CRAWLJOB_JMXMBEAN_TYPE);
2184 }
2185 this.mbeanName = new ObjectName(on.getDomain(), ht);
2186 return this.mbeanName;
2187 }
2188
2189 public void postRegister(Boolean registrationDone) {
2190 if (logger.isLoggable(Level.INFO)) {
2191 logger.info(
2192 JmxUtils.getLogRegistrationMsg(this.mbeanName.getCanonicalName(),
2193 this.mbeanServer, registrationDone.booleanValue()));
2194 }
2195 }
2196
    /**
     * MBeanRegistration hook; nothing to do ahead of deregistration.
     *
     * @throws Exception declared by the MBeanRegistration interface.
     */
    public void preDeregister() throws Exception {
        // Intentionally empty.
    }
2200
2201 public void postDeregister() {
2202 if (mbeanName == null) {
2203 return;
2204 }
2205 if (logger.isLoggable(Level.INFO)) {
2206 logger.info(JmxUtils.getLogUnregistrationMsg(
2207 this.mbeanName.getCanonicalName(), this.mbeanServer));
2208 }
2209 this.mbeanName = null;
2210 }
2211
2212 /***
2213 * @return Heritrix that is hosting this job.
2214 */
2215 protected Heritrix getHostingHeritrix() {
2216 Heritrix hostingHeritrix = null;
2217 Map heritrice = Heritrix.getInstances();
2218 for (final Iterator i = heritrice.keySet().iterator(); i.hasNext();) {
2219 Heritrix h = (Heritrix)heritrice.get(i.next());
2220 if (h.getJobHandler().getCurrentJob() == this) {
2221 hostingHeritrix = h;
2222 break;
2223 }
2224 }
2225 return hostingHeritrix;
2226 }
2227
2228 /***
2229 * @return Unique name for job that is safe to use in jmx (Like display
2230 * name but without spaces).
2231 */
2232 public String getJmxJobName() {
2233 return getJobName() + "-" + getUID();
2234 }
2235
2236 /***
2237 * @return Notification sequence number (Does increment after each access).
2238 */
2239 protected static int getNotificationsSequenceNumber() {
2240 return notificationsSequenceNumber++;
2241 }
2242
2243 protected ObjectName getMbeanName() {
2244 return this.mbeanName;
2245 }
2246
2247 /***
2248 * @return the statistics tracking instance (of null if none yet available).
2249 */
2250 public StatisticsTracking getStatisticsTracking() {
2251 return this.controller == null ||
2252 this.controller.getStatistics() == null? null:
2253 this.controller.getStatistics();
2254 }
2255 }