package org.archive.crawler.admin;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.frontier.FrontierJournal;
import org.archive.crawler.settings.ComplexType;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;


/**
 * This class manages CrawlJobs. Submitted crawl jobs are queued up and run
 * in order when the crawler is running.
 * <p>Basically this provides a layer between any potential user interface and
 * the CrawlJobs. It keeps the lists of completed jobs, pending jobs, etc.
 * <p>
 * The jobs managed by the handler can be divided into the following:
 * <ul>
 * <li><code>Pending</code> - Jobs that are ready to run and are waiting their
 *                            turn. These can be edited, viewed, deleted, etc.
 * <li><code>Running</code> - Only one job can be running at a time. There may
 *                            be no job running. The running job can be viewed
 *                            and edited to some extent. It can also be
 *                            terminated. This job should have a
 *                            StatisticsTracking module attached to it for more
 *                            details on the crawl.
 * <li><code>Completed</code> - Jobs that have finished crawling or have been
 *                            deleted from the pending queue or terminated
 *                            while running. They cannot be edited but can be
 *                            viewed. They retain the StatisticsTracking
 *                            module from their run.
 * <li><code>New job</code> - At any given time there can be one 'new job'; the
 *                            new job is not considered ready to run. It can
 *                            be edited or discarded (in which case it will be
 *                            totally destroyed, including any files on disk).
 *                            Once an operator deems the job ready to run it
 *                            can be moved to the pending queue.
 * <li><code>Profiles</code> - Jobs under profiles are not actual jobs. They can
 *                            be edited normally but cannot be submitted to
 *                            the pending queue. New jobs can be created
 *                            using a profile as their template.
 * </ul>
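 *
 * <p>A minimal usage sketch (the jobs directory and job names here are
 * illustrative, not part of this API):
 * <pre>{@code
 * CrawlJobHandler handler = new CrawlJobHandler(new File("jobs"));
 * CrawlJob job = handler.newJob(handler.getDefaultProfile(), null,
 *     "my-crawl", "An example crawl", "http://example.com/",
 *     CrawlJob.PRIORITY_AVERAGE);
 * handler.addJob(job);      // queue the configured 'new job'
 * handler.startCrawler();   // begin crawling pending jobs in order
 * }</pre>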
 *
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.admin.CrawlJob
 */

public class CrawlJobHandler implements CrawlStatusListener {
    private static final Logger logger =
        Logger.getLogger(CrawlJobHandler.class.getName());

    /**
     * Name of the system property whose value, if set, overrides the default
     * profile used.
     */
    public static final String DEFAULT_PROFILE_NAME
        = "heritrix.default.profile";

    /**
     * Default profile name.
     */
    public static final String DEFAULT_PROFILE = "default";

    /**
     * Name of the profiles directory.
     */
    public static final String PROFILES_DIR_NAME = "profiles";

    /**
     * Name of the order file.
     */
    public static final String ORDER_FILE_NAME = "order.xml";

    /**
     * Job currently being crawled.
     */
    private CrawlJob currentJob = null;

    /**
     * A new job that is being created/configured. Not yet ready for crawling.
     */
    private CrawlJob newJob = null;

    /**
     * Thread that starts the next job in the background.
     */
    private Thread startingNextJob = null;

    /**
     * A list of pending CrawlJobs.
     */
    private TreeSet<CrawlJob> pendingCrawlJobs;

    /**
     * A list of completed CrawlJobs.
     */
    private TreeSet<CrawlJob> completedCrawlJobs;

    /**
     * A list of profile CrawlJobs.
     */
    private TreeSet<CrawlJob> profileJobs;

    /**
     * Name of the current default profile.
     */
    private String defaultProfile = null;

    /**
     * If true the crawler is 'running'. That is, the next pending job will
     * start crawling as soon as the current job (if any) is completed.
     */
    private boolean running = false;

    /**
     * String to indicate recovery should be based on the recovery log, not
     * based on checkpointing.
     */
    public static final String RECOVER_LOG = "recover";

    /**
     * Jobs directory.
     */
    private final File jobsDir;

    /**
     * Constructor.
     * @param jobsDir Jobs directory.
     */
    public CrawlJobHandler(final File jobsDir) {
        this(jobsDir, true, true);
    }

    /**
     * Constructor allowing for optional loading of profiles and jobs.
     * @param jobsDir Jobs directory.
     * @param loadJobs If true then any applicable jobs will be loaded.
     * @param loadProfiles If true then any applicable profiles will be loaded.
     */
    public CrawlJobHandler(final File jobsDir,
            final boolean loadJobs, final boolean loadProfiles) {
        this.jobsDir = jobsDir;
        // Order jobs by ascending priority. TreeSet discards elements that
        // compare as equal, so ties are broken on the (unique) job UID.
        Comparator<CrawlJob> comp = new Comparator<CrawlJob>() {
            public int compare(CrawlJob job1, CrawlJob job2) {
                if (job1.getJobPriority() < job2.getJobPriority()) {
                    return -1;
                } else if (job1.getJobPriority() > job2.getJobPriority()) {
                    return 1;
                } else {
                    // Same priority; use the UID to differentiate.
                    return job1.getUID().compareTo(job2.getUID());
                }
            }
        };
        this.pendingCrawlJobs = new TreeSet<CrawlJob>(comp);
        this.completedCrawlJobs = new TreeSet<CrawlJob>(comp);
        this.profileJobs = new TreeSet<CrawlJob>(comp);
        if (loadProfiles) {
            loadProfiles();
        }
        if (loadJobs) {
            loadJobs();
        }
    }

    /**
     * Find the state.job file in the job directory.
     * @param jobDir Directory to look in.
     * @return Full path to 'state.job' file or null if none found.
     */
    protected File getStateJobFile(final File jobDir) {
        // Look for a single readable '.job' file.
        File[] jobFiles = jobDir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.toLowerCase().endsWith(".job") &&
                    (new File(dir, name)).canRead();
            }
        });
        return (jobFiles != null && jobFiles.length == 1)? jobFiles[0]: null;
    }

    /**
     * Loads any available jobs in the jobs directory.
     * <p>
     * Available jobs are any directory containing a file called
     * <code>state.job</code>. The file must contain valid job information.
     */
    private void loadJobs() {
        this.jobsDir.mkdirs();
        File[] jobs = this.jobsDir.listFiles();
        for (int i = 0; i < jobs.length; i++) {
            if (jobs[i].isDirectory()) {
                File jobFile = getStateJobFile(jobs[i]);
                if (jobFile != null) {
                    loadJob(jobFile);
                }
            }
        }
    }

    /**
     * Loads a job given a specific job file. The loaded job will be placed in
     * the list of completed jobs or pending queue depending on its status.
     * Running jobs will have their status set to 'finished abnormally' and put
     * into the completed list.
     * @param job The job file of the job to load.
     */
    protected void loadJob(final File job) {
        CrawlJob cjob = null;
        try {
            // Load the CrawlJob's meta data from the job file.
            cjob = new CrawlJob(job, new CrawlJobErrorHandler());
        } catch (InvalidJobFileException e) {
            logger.log(Level.INFO,
                "Invalid job file for " + job.getAbsolutePath(), e);
            return;
        } catch (IOException e) {
            logger.log(Level.INFO, "IOException for " + job.getName() +
                ", " + job.getAbsolutePath(), e);
            return;
        }

        // Check the job's status and place it accordingly.
        if (cjob.getStatus().equals(CrawlJob.STATUS_RUNNING)
                || cjob.getStatus().equals(CrawlJob.STATUS_PAUSED)
                || cjob.getStatus().equals(CrawlJob.STATUS_CHECKPOINTING)
                || cjob.getStatus().equals(CrawlJob.STATUS_WAITING_FOR_PAUSE)) {
            // Was a running job; must have been terminated abnormally.
            cjob.setStatus(CrawlJob.STATUS_FINISHED_ABNORMAL);
            this.completedCrawlJobs.add(cjob);
        } else if (cjob.getStatus().equals(CrawlJob.STATUS_PENDING)) {
            // Was a pending job.
            this.pendingCrawlJobs.add(cjob);
        } else if (cjob.getStatus().equals(CrawlJob.STATUS_CREATED)
                || cjob.getStatus().equals(CrawlJob.STATUS_DELETED)) {
            // Ignore created and deleted jobs.
        } else {
            // Must be a completed job.
            this.completedCrawlJobs.add(cjob);
        }
    }

    /**
     * Looks in the conf dir for a profiles dir.
     * @return the directory where profiles are stored, or null if none
     * available
     * @throws IOException
     */
    private File getProfilesDirectory() throws IOException {
        URL webappProfilePath = Heritrix.class.getResource("/" +
            PROFILES_DIR_NAME);
        if (webappProfilePath != null) {
            try {
                return new File(new URI(webappProfilePath.toString()));
            } catch (java.lang.IllegalArgumentException e) {
                // The resource is not a plain file (e.g. we're running from
                // a war); fall through to the conf dir lookup below.
            } catch (java.net.URISyntaxException e) {
                e.printStackTrace();
            }
        }
        return (Heritrix.getConfdir(false) == null)? null:
            new File(Heritrix.getConfdir().getAbsolutePath(),
                PROFILES_DIR_NAME);
    }

    /**
     * Loads the default profile and all other profiles found on disk.
     */
    private void loadProfiles() {
        boolean loadedDefault = false;
        File profileDir = null;
        try {
            profileDir = getProfilesDirectory();
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (profileDir != null) {
            File[] ps = profileDir.listFiles();
            if (ps != null && ps.length > 0) {
                for (int i = 0; i < ps.length; i++) {
                    File f = ps[i];
                    if (f.isDirectory()) {
                        // Each subdirectory holding a readable order file is
                        // a profile.
                        File profile = new File(f, ORDER_FILE_NAME);
                        if (profile.canRead()) {
                            boolean b = loadProfile(profile);
                            if (b) {
                                loadedDefault = b;
                            }
                        }
                    }
                }
            }
        }
        // If the default profile was not among those found on disk, try
        // loading it from the well-known default profile location.
        String parent = File.separator + PROFILES_DIR_NAME + File.separator;
        if (!loadedDefault) {
            loadProfile(new File(parent + DEFAULT_PROFILE, ORDER_FILE_NAME));
        }
        // The default profile is the one named 'default'.
        defaultProfile = DEFAULT_PROFILE;
    }

    /**
     * Load one profile.
     * @param profile Profile to load.
     * @return True if the loaded profile was the default profile.
     */
    protected boolean loadProfile(File profile) {
        boolean loadedDefault = false;
        try {
            // The profile's name is the name of the directory it sits in.
            XMLSettingsHandler newSettingsHandler =
                new XMLSettingsHandler(profile);
            CrawlJobErrorHandler cjseh =
                new CrawlJobErrorHandler(Level.SEVERE);
            newSettingsHandler.
                setErrorReportingLevel(cjseh.getLevel());
            newSettingsHandler.initialize();
            addProfile(new CrawlJob(profile.getParentFile().getName(),
                newSettingsHandler, cjseh));
            loadedDefault = profile.getParentFile().getName().
                equals(DEFAULT_PROFILE);
        } catch (InvalidAttributeValueException e) {
            System.err.println("Failed to load profile '" +
                profile.getParentFile().getName() +
                "'. InvalidAttributeValueException.");
        }
        return loadedDefault;
    }

    /**
     * Add a new profile.
     * @param profile The new profile.
     */
    public synchronized void addProfile(CrawlJob profile) {
        profileJobs.add(profile);
    }

    /**
     * Delete a profile, removing both its on-disk directory and its entry in
     * the list of profiles.
     * @param cj The profile to delete.
     * @throws IOException If no such profile exists on disk.
     */
    public synchronized void deleteProfile(CrawlJob cj) throws IOException {
        File d = getProfilesDirectory();
        File p = new File(d, cj.getJobName());
        if (!p.exists()) {
            throw new IOException("No profile named " + cj.getJobName() +
                " at " + d.getAbsolutePath());
        }
        FileUtils.deleteDir(p);
        this.profileJobs.remove(cj);
    }

    /**
     * Returns a List of all known profiles.
     * @return a List of all known profiles.
     */
    public synchronized List<CrawlJob> getProfiles() {
        ArrayList<CrawlJob> tmp = new ArrayList<CrawlJob>(profileJobs.size());
        tmp.addAll(profileJobs);
        return tmp;
    }

    /**
     * Submit a job to the handler. The job will be scheduled for crawling. At
     * present it will not take the job's priority into consideration.
     *
     * @param job A new job for the handler
     * @return CrawlJob that was added or null.
     */
    public CrawlJob addJob(CrawlJob job) {
        if (job.isProfile()) {
            // Profiles cannot be crawled.
            return null;
        }
        job.setStatus(CrawlJob.STATUS_PENDING);
        if (job.isNew()) {
            // We are submitting the handler's 'new job'; clear it.
            this.newJob = null;
            job.setNew(false);
        }
        this.pendingCrawlJobs.add(job);
        if (!isCrawling() && isRunning()) {
            // Start crawling.
            startNextJob();
        }
        return job;
    }

    /**
     * Returns the default profile. If no default profile has been set it will
     * return the first profile that was set/loaded and still exists. If no
     * profiles exist it will return null.
     * @return the default profile.
     */
    public synchronized CrawlJob getDefaultProfile() {
        if (defaultProfile != null) {
            for (CrawlJob item: profileJobs) {
                if (item.getJobName().equals(defaultProfile)) {
                    // Found it.
                    return item;
                }
            }
        }
        if (profileJobs.size() > 0) {
            return profileJobs.first();
        }
        return null;
    }

    /**
     * Set the default profile.
     * @param profile The new default profile. The following must apply to it:
     * profile.isProfile() should return true and
     * this.getProfiles() should contain it.
     */
    public void setDefaultProfile(CrawlJob profile) {
        defaultProfile = profile.getJobName();
    }

    /**
     * A List of all pending jobs.
     *
     * @return A List of all pending jobs.
     *         No promises are made about the order of the list.
     */
    public List<CrawlJob> getPendingJobs() {
        ArrayList<CrawlJob> tmp
            = new ArrayList<CrawlJob>(pendingCrawlJobs.size());
        tmp.addAll(pendingCrawlJobs);
        return tmp;
    }

    /**
     * @return The job currently being crawled.
     */
    public CrawlJob getCurrentJob() {
        return currentJob;
    }

    /**
     * @return A List of all finished jobs.
     */
    public List<CrawlJob> getCompletedJobs() {
        ArrayList<CrawlJob> tmp
            = new ArrayList<CrawlJob>(completedCrawlJobs.size());
        tmp.addAll(completedCrawlJobs);
        return tmp;
    }

    /**
     * Return a job with the given UID.
     * Doesn't matter if it's pending, currently running, has finished running,
     * is new, or is a profile.
     *
     * @param jobUID The unique ID of the job.
     * @return The job with the UID, or null if no such job is found.
     */
    public CrawlJob getJob(String jobUID) {
        if (jobUID == null) {
            return null;
        }
        // First check the currently running job.
        if (currentJob != null && currentJob.getUID().equals(jobUID)) {
            return currentJob;
        } else if (newJob != null && newJob.getUID().equals(jobUID)) {
            // Then check the 'new job'.
            return newJob;
        } else {
            // Then check the pending jobs.
            for (CrawlJob cj: pendingCrawlJobs) {
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }

            // Then check the completed jobs.
            for (CrawlJob cj: completedCrawlJobs) {
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }

            // And finally check the profiles.
            for (CrawlJob cj: getProfiles()) {
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }
        }
        return null;
    }

    /**
     * @return True if we terminated a current job (false if there was no job
     * to terminate).
     */
    public boolean terminateCurrentJob() {
        if (this.currentJob == null) {
            return false;
        }
        // stopCrawling() triggers crawlEnding(), which files the job with
        // the completed jobs and calls notifyAll() on this handler.
        this.currentJob.stopCrawling();
        synchronized (this) {
            try {
                // Wait up to 3 seconds for the crawl to end; crawlEnding()
                // notifies us once the job has been cleaned up.
                wait(3000);
            } catch (InterruptedException e) {
                // Ignore.
            }
        }
        return true;
    }

    /**
     * The specified job will be removed from the pending queue or aborted if
     * currently running. It will be placed in the list of completed jobs with
     * appropriate status info. If the job is already in the completed list or
     * no job with the given UID is found, no action will be taken.
     *
     * @param jobUID The UID (unique ID) of the job that is to be deleted.
     */
    public void deleteJob(String jobUID) {
        // First check to see if we are deleting the current job.
        if (currentJob != null && jobUID.equals(currentJob.getUID())) {
            terminateCurrentJob();
            return; // We are not going to find another job with the same UID.
        }

        // It isn't the current job, so check the pending jobs.
        for (Iterator<CrawlJob> it = pendingCrawlJobs.iterator();
                it.hasNext();) {
            CrawlJob cj = it.next();
            if (cj.getUID().equals(jobUID)) {
                // Found the one to delete.
                cj.setStatus(CrawlJob.STATUS_DELETED);
                it.remove();
                return;
            }
        }

        // And finally check the completed jobs.
        for (Iterator<CrawlJob> it = completedCrawlJobs.iterator();
                it.hasNext();) {
            CrawlJob cj = it.next();
            if (cj.getUID().equals(jobUID)) {
                // Found the one to delete.
                cj.setStatus(CrawlJob.STATUS_DELETED);
                it.remove();
                return;
            }
        }
    }

    /**
     * Cause the current job to pause. If no current job is crawling this
     * method will have no effect.
     */
    public void pauseJob() {
        if (this.currentJob != null) {
            this.currentJob.pause();
        }
    }

    /**
     * Cause the current job to resume crawling if it was paused. Will have no
     * effect if the current job was not paused or if there is no current job.
     * If the current job is still waiting to pause, this will not take effect
     * until the job has actually paused. At that time it will immediately
     * resume crawling.
     */
    public void resumeJob() {
        if (this.currentJob != null) {
            this.currentJob.resume();
        }
    }

    /**
     * Cause the current job to write a checkpoint to disk. Currently
     * requires the job to already be paused.
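     * <p>A sketch of the expected call sequence (the job must first actually
     * reach the paused state):
     * <pre>{@code
     * handler.pauseJob();
     * // ... wait until the job reports that it is paused ...
     * handler.checkpointJob();
     * }</pre>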
     * @throws IllegalStateException Thrown if the crawl is not paused.
     */
    public void checkpointJob() throws IllegalStateException {
        if (this.currentJob != null) {
            this.currentJob.checkpoint();
        }
    }

    /**
     * Returns a unique job ID.
     * <p>
     * No two calls to this method (on the same instance of this class) can
     * ever return the same value. <br>
     * Currently implemented to return a time stamp. That is subject to change
     * though.
     *
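     * <p>A sketch of typical use (the digits shown are illustrative; the
     * actual value is a 17-digit timestamp taken from the clock):
     * <pre>{@code
     * String uid = handler.getNextJobUID(); // e.g. "20041120203000000"
     * }</pre>
     *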
     * @return A unique job ID.
     *
     * @see ArchiveUtils#TIMESTAMP17
     */
    public String getNextJobUID() {
        return ArchiveUtils.get17DigitDate();
    }

    /**
     * Creates a new job. The new job will be returned and also registered as
     * the handler's 'new job'. The new job will be based on the settings
     * provided but created in a new location on disk.
     *
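     * <p>For example, resubmitting a finished job as a recovery of its
     * recover.gz log might look like this (job and seed names are
     * illustrative):
     * <pre>{@code
     * CrawlJob rerun = handler.newJob(oldJob, CrawlJobHandler.RECOVER_LOG,
     *     "site-crawl-rerun", "Recovery of site-crawl", seedsText,
     *     CrawlJob.PRIORITY_AVERAGE);
     * }</pre>
     *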
     * @param baseOn
     *            A CrawlJob (with a valid settingshandler) to use as the
     *            template for the new job.
     * @param recovery Whether to preinitialize the new job as a recovery of
     * the <code>baseOn</code> job. The string holds RECOVER_LOG if we are to
     * do the recovery based off the recover.gz log -- see RecoveryJournal in
     * the frontier package -- or it holds the name of
     * the checkpoint we're to use recovering.
     * @param name
     *            The name of the new job.
     * @param description
     *            Description of the job.
     * @param seeds
     *            The contents of the new settings' seed file.
     * @param priority
     *            The priority of the new job.
     *
     * @return The new crawl job.
     * @throws FatalConfigurationException If a problem occurs creating the
     *             settings.
     */
    public CrawlJob newJob(CrawlJob baseOn, String recovery, String name,
            String description, String seeds, int priority)
    throws FatalConfigurationException {
        // Work out the recovery source, if any.
        File recover = null;
        try {
            if (recovery != null && recovery.length() > 0
                    && recovery.equals(RECOVER_LOG)) {
                // Recover from the baseOn job's recovery log.
                File dir = baseOn.getSettingsHandler().getOrder()
                    .getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
                recover = new File(dir, FrontierJournal.LOGNAME_RECOVER);
            } else if (recovery != null && recovery.length() > 0) {
                // Must be the name of a checkpoint to recover from.
                recover = new File(baseOn.getSettingsHandler().
                    getOrder().getSettingsDir(CrawlOrder.ATTR_CHECKPOINTS_PATH),
                    recovery);
            }
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while setting up " +
                "new job/profile " + name + " \n" + e1.getMessage());
        }

        CrawlJob cj = createNewJob(baseOn.getSettingsHandler().getOrderFile(),
            name, description, seeds, priority);

        updateRecoveryPaths(recover, cj.getSettingsHandler(), name);

        return cj;
    }

    /**
     * Creates a new job. The new job will be returned and also registered as
     * the handler's 'new job'. The new job will be based on the settings
     * provided but created in a new location on disk.
     * @param orderFile Order file to use as the template for the new job.
     * @param name The name of the new job.
     * @param description Description of the job.
     * @param seeds The contents of the new settings' seed file.
     *
     * @return The new crawl job.
     * @throws FatalConfigurationException If a problem occurs creating the
     *             settings.
     */
    public CrawlJob newJob(final File orderFile, final String name,
            final String description, final String seeds)
    throws FatalConfigurationException {
        return createNewJob(orderFile, name, description, seeds,
            CrawlJob.PRIORITY_AVERAGE);
    }

    protected void checkDirectory(File dir)
    throws FatalConfigurationException {
        if (dir == null) {
            return;
        }
        if (!dir.exists() || !dir.canRead()) {
            throw new FatalConfigurationException(dir.getAbsolutePath() +
                " does not exist or is unreadable");
        }
    }

    protected CrawlJob createNewJob(final File orderFile, final String name,
            final String description, final String seeds, final int priority)
    throws FatalConfigurationException {
        if (newJob != null) {
            // There already is a 'new job'. Discard it.
            discardNewJob();
        }
        String UID = getNextJobUID();
        File jobDir = new File(this.jobsDir, name + "-" + UID);
        CrawlJobErrorHandler errorHandler = new CrawlJobErrorHandler();
        XMLSettingsHandler handler =
            createSettingsHandler(orderFile, name, description,
                seeds, jobDir, errorHandler, ORDER_FILE_NAME, "seeds.txt");
        this.newJob = new CrawlJob(UID, name, handler, errorHandler, priority,
            jobDir);
        return this.newJob;
    }

    /**
     * Creates a new profile. The new profile will be returned and also
     * registered as the handler's 'new job'. The new profile will be based on
     * the settings provided but created in a new location on disk.
     *
     * @param baseOn
     *            A CrawlJob (with a valid settingshandler) to use as the
     *            template for the new profile.
     * @param name
     *            The name of the new profile.
     * @param description
     *            Description of the new profile.
     * @param seeds
     *            The contents of the new profile's seed file.
     * @return The new profile.
     * @throws FatalConfigurationException
     * @throws IOException
     */
    public CrawlJob newProfile(CrawlJob baseOn, String name, String description,
            String seeds)
    throws FatalConfigurationException, IOException {
        File profileDir = new File(getProfilesDirectory().getAbsoluteFile()
            + File.separator + name);
        CrawlJobErrorHandler cjseh = new CrawlJobErrorHandler(Level.SEVERE);
        CrawlJob newProfile = new CrawlJob(name,
            createSettingsHandler(baseOn.getSettingsHandler().getOrderFile(),
                name, description, seeds, profileDir, cjseh, ORDER_FILE_NAME,
                "seeds.txt"), cjseh);
        addProfile(newProfile);
        return newProfile;
    }

    /**
     * Creates a new settings handler based on an existing job. Basically all
     * the settings files for the 'based on' job will be copied to the
     * specified directory.
     *
     * @param orderFile Order file to base the new order file on. Cannot be
     *            null.
     * @param name Name for the new settings.
     * @param description Description of the new settings.
     * @param seeds The contents of the new settings' seed file.
     * @param newSettingsDir Directory the new settings are written into.
     * @param errorHandler Error handler to register with the new settings.
     * @param filename Name of the new order file.
     * @param seedfile Name of the new seeds file.
     *
     * @return The new settings handler.
     * @throws FatalConfigurationException
     *             If there are problems with reading the 'base on'
     *             configuration, with writing the new configuration or its
     *             seed file.
     */
    protected XMLSettingsHandler createSettingsHandler(
            final File orderFile, final String name, final String description,
            final String seeds, final File newSettingsDir,
            final CrawlJobErrorHandler errorHandler,
            final String filename, final String seedfile)
    throws FatalConfigurationException {
        XMLSettingsHandler newHandler = null;
        try {
            newHandler = new XMLSettingsHandler(orderFile);
            if (errorHandler != null) {
                newHandler.registerValueErrorHandler(errorHandler);
                newHandler.setErrorReportingLevel(errorHandler.getLevel());
            }
            newHandler.initialize();
        } catch (InvalidAttributeValueException e2) {
            throw new FatalConfigurationException(
                "InvalidAttributeValueException occurred while creating" +
                " new settings handler for new job/profile\n" +
                e2.getMessage());
        }

        // Create the new settings directory.
        newSettingsDir.mkdirs();

        try {
            // Set the seed file location in the new settings.
            ((ComplexType)newHandler.getOrder().getAttribute("scope"))
                .setAttribute(new Attribute("seedsfile", seedfile));
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while setting up " +
                "new job/profile\n" + e1.getMessage());
        } catch (InvalidAttributeValueException e1) {
            throw new FatalConfigurationException(
                "InvalidAttributeValueException occurred while setting " +
                "up new job/profile\n" + e1.getMessage());
        } catch (MBeanException e1) {
            throw new FatalConfigurationException(
                "MBeanException occurred while setting up new" +
                " job/profile\n" + e1.getMessage());
        } catch (ReflectionException e1) {
            throw new FatalConfigurationException(
                "ReflectionException occurred while setting up" +
                " new job/profile\n" + e1.getMessage());
        }

        File newFile = new File(newSettingsDir.getAbsolutePath(), filename);

        try {
            newHandler.copySettings(newFile, (String)newHandler.getOrder()
                .getAttribute(CrawlOrder.ATTR_SETTINGS_DIRECTORY));
        } catch (IOException e3) {
            // Print stack trace to help debug cases where a profile cannot
            // be used to create a new job.
            e3.printStackTrace();
            throw new FatalConfigurationException(
                "IOException occurred while writing new settings files" +
                " for new job/profile\n" + e3.getMessage());
        } catch (AttributeNotFoundException e) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while writing new" +
                " settings files for new job/profile\n" + e.getMessage());
        } catch (MBeanException e) {
            throw new FatalConfigurationException(
                "MBeanException occurred while writing new settings files" +
                " for new job/profile\n" + e.getMessage());
        } catch (ReflectionException e) {
            throw new FatalConfigurationException(
                "ReflectionException occurred while writing new settings" +
                " files for new job/profile\n" + e.getMessage());
        }
        CrawlerSettings orderfile = newHandler.getSettingsObject(null);
        orderfile.setName(name);
        orderfile.setDescription(description);

        if (seeds != null) {
            BufferedWriter writer = null;
            try {
                writer = new BufferedWriter(new FileWriter(newHandler
                    .getPathRelativeToWorkingDirectory(seedfile)));
                try {
                    writer.write(seeds);
                } finally {
                    writer.close();
                }
            } catch (IOException e) {
                throw new FatalConfigurationException(
                    "IOException occurred while writing the seed file for new"
                    + " job/profile\n" + e.getMessage());
            }
        }
        return newHandler;
    }

    /**
     * @param recover
     *            Source to use recovering. Can be full path to a recovery log
     *            or full path to a checkpoint src dir.
     * @param sh
     *            Settings Handler to update.
     * @param jobName
     *            Name of this job.
     * @throws FatalConfigurationException
     */
    protected void updateRecoveryPaths(final File recover,
            final SettingsHandler sh, final String jobName)
    throws FatalConfigurationException {
        if (recover == null) {
            return;
        }
        checkDirectory(recover);
        try {
            // Point 'recover-path' at the old job's recovery source.
            updateRecoveryPaths(recover, sh);
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while setting up "
                    + "new job/profile " + jobName + " \n"
                    + e1.getMessage());
        } catch (InvalidAttributeValueException e1) {
            throw new FatalConfigurationException(
                "InvalidAttributeValueException occurred while setting up "
                    + "new job/profile " + jobName + " \n"
                    + e1.getMessage());
        } catch (MBeanException e1) {
            throw new FatalConfigurationException(
                "MBeanException occurred while setting up "
                    + "new job/profile " + jobName + " \n"
                    + e1.getMessage());
        } catch (ReflectionException e1) {
            throw new FatalConfigurationException(
                "ReflectionException occurred while setting up "
                    + "new job/profile " + jobName + " \n"
                    + e1.getMessage());
        } catch (IOException e) {
            throw new FatalConfigurationException(
                "IOException occurred while setting up new job/profile "
                    + jobName + " \n" + e.getMessage());
        }
    }

    /**
     * @param recover
     *            Source to use recovering. Can be full path to a recovery log
     *            or full path to a checkpoint src dir.
     * @param newHandler
     *            Settings Handler to update.
     * @throws ReflectionException
     * @throws MBeanException
     * @throws InvalidAttributeValueException
     * @throws AttributeNotFoundException
     * @throws IOException
     */
    private void updateRecoveryPaths(final File recover,
            SettingsHandler newHandler)
    throws AttributeNotFoundException, InvalidAttributeValueException,
            MBeanException, ReflectionException, IOException {
        if (recover == null || !recover.exists()) {
            throw new IOException("Recovery src does not exist: " + recover);
        }
        newHandler.getOrder().setAttribute(
            new Attribute(CrawlOrder.ATTR_RECOVER_PATH,
                recover.getAbsolutePath()));

        // Make sure the new job's 'logs' and 'state' directories do not
        // collide with the old job's: while they already hold files, keep
        // appending a '-R' suffix so the recovered crawl gets fresh ones.
        File newLogsDisk = null;
        final String RECOVERY_SUFFIX = "-R";
        while (true) {
            try {
                newLogsDisk = newHandler.getOrder().
                    getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
            } catch (AttributeNotFoundException e) {
                logger.log(Level.SEVERE, "Failed to get logs directory", e);
            }
            if (newLogsDisk != null && newLogsDisk.list().length > 0) {
                // 'new' logs directory is already in use; rename with suffix.
                String logsPath = (String) newHandler.getOrder().
                    getAttribute(CrawlOrder.ATTR_LOGS_PATH);
                if (logsPath.endsWith("/")) {
                    logsPath = logsPath.substring(0, logsPath.length() - 1);
                }
                newHandler.getOrder().setAttribute(
                    new Attribute(CrawlOrder.ATTR_LOGS_PATH,
                        logsPath + RECOVERY_SUFFIX));
            } else {
                // Directory is suitably empty; use it.
                break;
            }
        }
        File newStateDisk = null;
        while (true) {
            try {
                newStateDisk = newHandler.getOrder().getSettingsDir(
                    CrawlOrder.ATTR_STATE_PATH);
            } catch (AttributeNotFoundException e) {
                logger.log(Level.SEVERE, "Failed to get state directory", e);
            }
            if (newStateDisk != null && newStateDisk.list().length > 0) {
                // 'new' state directory is already in use; rename with suffix.
                String statePath = (String) newHandler.getOrder().
                    getAttribute(CrawlOrder.ATTR_STATE_PATH);
                if (statePath.endsWith("/")) {
                    statePath = statePath.substring(0, statePath.length() - 1);
                }
                newHandler.getOrder().setAttribute(
                    new Attribute(CrawlOrder.ATTR_STATE_PATH,
                        statePath + RECOVERY_SUFFIX));
            } else {
                // Directory is suitably empty; use it.
                break;
            }
        }
    }

    /**
     * Discard the handler's 'new job'. This will remove any files/directories
     * written to disk.
     */
    public void discardNewJob() {
        if (newJob != null) {
            FileUtils.deleteDir(new File(newJob.getSettingsDirectory()));
            newJob = null;
        }
    }

    /**
     * Get the handler's 'new job'.
     * @return the handler's 'new job'
     */
    public CrawlJob getNewJob() {
        return newJob;
    }

    /**
     * Is the crawler accepting crawl jobs to run?
     * @return True if the next available CrawlJob will be crawled. False
     * otherwise.
     */
    public boolean isRunning() {
        return running;
    }

    /**
     * Is a crawl job being crawled?
     * @return True if a job is actually being crawled (even if it is paused).
     *         False if no job is being crawled.
     */
    public boolean isCrawling() {
        return this.currentJob != null;
    }

    /**
     * Allow jobs to be crawled.
     */
    public void startCrawler() {
        running = true;
        if (pendingCrawlJobs.size() > 0 && !isCrawling()) {
            // Kick off the next job since we aren't already crawling.
            startNextJob();
        }
    }

    /**
     * Stop future jobs from being crawled.
     *
     * This action will not affect the current job.
     */
    public void stopCrawler() {
        running = false;
    }

    /**
     * Start the next crawl job.
     *
     * If a job is already running this method will do nothing.
     */
    protected final void startNextJob() {
        synchronized (this) {
            if (startingNextJob != null) {
                try {
                    // Wait for any previous start-up thread to finish.
                    startingNextJob.join();
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    return;
                }
            }
            startingNextJob = new Thread(new Runnable() {
                public void run() {
                    startNextJobInternal();
                }
            }, "StartNextJob");
            startingNextJob.start();
        }
    }

    protected void startNextJobInternal() {
        if (pendingCrawlJobs.size() == 0 || isCrawling()) {
            // No job ready or already crawling.
            return;
        }
        this.currentJob = pendingCrawlJobs.first();
        assert pendingCrawlJobs.contains(currentJob) :
            "pendingCrawlJobs is in an illegal state";
        pendingCrawlJobs.remove(currentJob);
        try {
            this.currentJob.setupForCrawlStart();
            // Register as listener so we can clear the currentJob reference
            // in crawlEnding() and start the next job in crawlEnded().
            this.currentJob.getController().addCrawlStatusListener(this);
            // Start the crawl.
            this.currentJob.getController().requestCrawlStart();
        } catch (InitializationException e) {
            // The job failed to initialize; reload it (it will be filed
            // with the completed jobs) and move on to the next pending job.
            loadJob(getStateJobFile(this.currentJob.getDirectory()));
            this.currentJob = null;
            startNextJobInternal();
        }
    }

    /**
     * Forward a 'kick' update to the current job, if any.
     */
    public void kickUpdate() {
        if (this.currentJob != null) {
            this.currentJob.kickUpdate();
        }
    }

    /**
     * Loads options from a file. Typically these are a list of available
     * modules that can be plugged into some part of the configuration;
     * for example Processors, Frontiers, Filters, etc. Leading and trailing
     * spaces are trimmed from each line.
     *
     * <p>Options are loaded from the CLASSPATH.
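     * <p>A sketch of typical use (the option file name here is illustrative):
     * <pre>{@code
     * ArrayList<String> options =
     *     CrawlJobHandler.loadOptions("Processor.options");
     * }</pre>
     *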
     * @param file the name of the option file (without path!)
     * @return The option file with each option line as a separate entry in the
     *         ArrayList.
     * @throws IOException when there is trouble reading the file.
     */
    public static ArrayList<String> loadOptions(String file)
    throws IOException {
        ArrayList<String> ret = new ArrayList<String>();
        Enumeration resources =
            CrawlJob.class.getClassLoader().getResources("modules/" + file);

        boolean noFileFound = true;
        while (resources.hasMoreElements()) {
            InputStream is = ((URL) resources.nextElement()).openStream();
            noFileFound = false;

            String line = null;
            BufferedReader bf =
                new BufferedReader(new InputStreamReader(is), 8192);
            try {
                while ((line = bf.readLine()) != null) {
                    line = line.trim();
                    if (line.indexOf('#') < 0 && line.length() > 0) {
                        // Considered valid: not empty and contains no '#'
                        // (comment marker).
                        ret.add(line);
                    }
                }
            } finally {
                bf.close();
            }
        }

        if (noFileFound) {
            throw new IOException("Failed to get " + file + " from the" +
                " CLASSPATH");
        }

        return ret;
    }

    /**
     * Returns a URIFrontierMarker for the current, paused, job. If there is no
     * current job or it is not paused, null will be returned.
     *
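     * <p>A sketch of paging through a paused job's frontier (the regex and
     * batch size are illustrative):
     * <pre>{@code
     * FrontierMarker m = handler.getInitialMarker(".*", false);
     * ArrayList uris = handler.getPendingURIsList(m, 100, false);
     * }</pre>
     *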
     * @param regexpr
     *            A regular expression that each URI must match in order to be
     *            considered 'within' the marker.
     * @param inCacheOnly
     *            Limit marker scope to 'cached' URIs.
     * @return a URIFrontierMarker for the current job.
     * @see #getPendingURIsList(FrontierMarker, int, boolean)
     * @see org.archive.crawler.framework.Frontier#getInitialMarker(String,
     *      boolean)
     * @see org.archive.crawler.framework.FrontierMarker
     */
    public FrontierMarker getInitialMarker(String regexpr,
            boolean inCacheOnly) {
        return (this.currentJob != null)?
            this.currentJob.getInitialMarker(regexpr, inCacheOnly): null;
    }

    /**
     * Returns the frontier's URI list based on the provided marker. This
     * method will return null if there is no current job or if the current
     * job is not paused. Only when there is a paused current job will this
     * method return a URI list.
     *
     * @param marker
     *            URIFrontier marker
     * @param numberOfMatches
     *            maximum number of matches to return
     * @param verbose
     *            should detailed info be provided on each URI?
     * @return the frontier's URI list based on the provided marker
     * @throws InvalidFrontierMarkerException
     *             When marker is inconsistent with the current state of the
     *             frontier.
     * @see #getInitialMarker(String, boolean)
     * @see org.archive.crawler.framework.FrontierMarker
     */
    public ArrayList getPendingURIsList(FrontierMarker marker,
            int numberOfMatches, boolean verbose)
    throws InvalidFrontierMarkerException {
        return (this.currentJob != null)?
            this.currentJob.getPendingURIsList(marker, numberOfMatches,
                verbose):
            null;
    }

    /**
     * Delete any URIs from the frontier of the current (paused) job that match
     * the specified regular expression. If the current job is not paused (or
     * there is no current job) nothing will be done.
     * @param regexpr Regular expression to delete URIs by.
     * @return the number of URIs deleted
     */
    public long deleteURIsFromPending(String regexpr) {
        return deleteURIsFromPending(regexpr, null);
    }

    /**
     * Delete any URIs from the frontier of the current (paused) job that match
     * the specified regular expression. If the current job is not paused (or
     * there is no current job) nothing will be done.
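     * <p>For example, dropping all .jpg URIs from every queue might look like
     * this (the pattern is illustrative):
     * <pre>{@code
     * long dropped = handler.deleteURIsFromPending(".*\\.jpg$", null);
     * }</pre>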
     * @param uriPattern Regular expression to delete URIs by.
     * @param queuePattern Regular expression of target queues (or null for
     * all).
     * @return the number of URIs deleted
     */
    public long deleteURIsFromPending(String uriPattern, String queuePattern) {
        return (this.currentJob != null)?
            this.currentJob.deleteURIsFromPending(uriPattern, queuePattern): 0;
    }

    public String importUris(String file, String style, String force) {
        return importUris(file, style, "true".equals(force));
    }

    /**
     * Import URIs from a file or URL into the current job's frontier.
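     * <p>A minimal sketch of use (the file name shown is illustrative):
     * <pre>{@code
     * String report = handler.importUris("seeds-extra.txt", "default", false);
     * }</pre>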
     * @param fileOrUrl Name of file w/ seeds.
     * @param style What style of seeds -- crawl log (<code>crawlLog</code>
     * style), recovery journal (<code>recoveryJournal</code> style), or
     * seeds file style (pass <code>default</code> style).
     * @param forceRevisit Should we revisit even if seen before?
     * @return A display string that has a count of all added.
     */
    public String importUris(final String fileOrUrl, final String style,
            final boolean forceRevisit) {
        return (this.currentJob != null)?
            this.currentJob.importUris(fileOrUrl, style, forceRevisit): null;
    }

    protected int importUris(InputStream is, String style,
            boolean forceRevisit) {
        return (this.currentJob != null)?
            this.currentJob.importUris(is, style, forceRevisit): 0;
    }

    /**
     * Schedule a URI.
     * @param uri URI to schedule.
     * @param forceFetch Whether it should be force-fetched.
     * @param isSeed True if seed.
     * @throws URIException
     */
    public void importUri(final String uri, final boolean forceFetch,
            final boolean isSeed)
    throws URIException {
        importUri(uri, forceFetch, isSeed, true);
    }

    /**
     * Schedule a URI.
     * @param str String that can be: 1. a UURI, 2. a snippet of the
     * crawl.log line, or 3. a snippet from a recover log. See
     * {@link #importUris(InputStream, String, boolean)} for how it subparses
     * the lines from crawl.log and recover.log.
     * @param forceFetch Whether it should be force-fetched.
     * @param isSeed True if seed.
     * @param isFlush If true, flush the frontier IF it implements
     * flushing.
     * @throws URIException
     */
    public void importUri(final String str, final boolean forceFetch,
            final boolean isSeed, final boolean isFlush)
    throws URIException {
        if (this.currentJob != null) {
            this.currentJob.importUri(str, forceFetch, isSeed, isFlush);
        }
    }

    /**
     * If it's a HostQueuesFrontier, it needs to be flushed for the queued
     * URIs to take effect.
     */
    protected void doFlush() {
        if (this.currentJob != null) {
            this.currentJob.flush();
        }
    }

    public void stop() {
        if (isCrawling()) {
            deleteJob(getCurrentJob().getUID());
        }
    }

    public void requestCrawlStop() {
        if (this.currentJob != null) {
            this.currentJob.stopCrawling();
        }
    }

    /**
     * Ensure the order file with the new name/description is written.
     * See '[ 1066573 ] sometimes job based-on other job uses older job name'.
     * @param newJob Newly created job.
     * @param metaname Metaname for the new job.
     * @param description Description for the new job.
     * @return <code>newJob</code>
     */
    public static CrawlJob ensureNewJobWritten(CrawlJob newJob, String metaname,
            String description) {
        XMLSettingsHandler settingsHandler = newJob.getSettingsHandler();
        CrawlerSettings orderfile = settingsHandler.getSettingsObject(null);
        orderfile.setName(metaname);
        orderfile.setDescription(description);
        settingsHandler.writeSettingsObject(orderfile);
        return newJob;
    }

    public void crawlStarted(String message) {
        // Not interested.
    }

    public void crawlEnding(String sExitMessage) {
        // Reload the job so its state.job reflects the completed status;
        // loadJob files it with the completed jobs.
        loadJob(getStateJobFile(this.currentJob.getDirectory()));
        currentJob = null;
        synchronized (this) {
            // If a thread is waiting on the job ending (e.g. in
            // terminateCurrentJob()), wake it up.
            notifyAll();
        }
    }

    public void crawlEnded(String sExitMessage) {
        if (this.running) {
            startNextJob();
        }
    }

    public void crawlPausing(String statusMessage) {
        // Not interested.
    }

    public void crawlPaused(String statusMessage) {
        // Not interested.
    }

    public void crawlResuming(String statusMessage) {
        // Not interested.
    }

    public void crawlCheckpoint(File checkpointDir) throws Exception {
        // Not interested.
    }
}