1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.datamodel;
25
26 import java.io.IOException;
27 import java.io.ObjectInputStream;
28 import java.io.ObjectOutputStream;
29 import java.util.ArrayList;
30 import java.util.Collection;
31 import java.util.HashSet;
32 import java.util.Iterator;
33 import java.util.List;
34 import java.util.Set;
35 import java.util.concurrent.CopyOnWriteArrayList;
36
37 import org.apache.commons.httpclient.HttpStatus;
38 import org.apache.commons.httpclient.URIException;
39 import org.archive.crawler.datamodel.credential.CredentialAvatar;
40 import org.archive.crawler.datamodel.credential.Rfc2617Credential;
41 import org.archive.crawler.extractor.Link;
42 import org.archive.crawler.framework.Processor;
43 import org.archive.crawler.framework.ProcessorChain;
44 import org.archive.crawler.util.Transform;
45 import org.archive.net.UURI;
46 import org.archive.net.UURIFactory;
47 import org.archive.util.ArchiveUtils;
48 import org.archive.util.Base32;
49 import org.archive.util.HttpRecorder;
50
51 import st.ata.util.AList;
52 import st.ata.util.HashtableAList;
53
54
55 /***
56 * Represents a candidate URI and the associated state it
57 * collects as it is crawled.
58 *
59 * <p>Core state is in instance variables but a flexible
60 * attribute list is also available. Use this 'bucket' to carry
61 * custom processing extracted data and state across CrawlURI
62 * processing. See the {@link #putString(String, String)},
63 * {@link #getString(String)}, etc.
64 *
65 * @author Gordon Mohr
66 */
67 public class CrawlURI extends CandidateURI
68 implements FetchStatusCodes {
69
70 private static final long serialVersionUID = 7874096757350100472L;
71
72 public static final int UNCALCULATED = -1;
73
74
75
76
77
78
79
80
81
    /*** Processor to run next; null means continue with the chain's
     * default order. Transient: not retained across serialization. */
    transient private Processor nextProcessor;
    /*** Chain this URI is handed to once the current chain finishes. */
    transient private ProcessorChain nextProcessorChain;
    /*** Overall fetch status for the current processing pass; a value
     * from FetchStatusCodes once processing has begun. */
    private int fetchStatus = 0;
    /*** Count of times processing of this URI has been deferred. */
    private int deferrals = 0;
    /*** Count of fetch attempts made against this URI. */
    private int fetchAttempts = 0;
    /*** Number of the ToeThread currently processing this URI. */
    transient private int threadNumber;


    /*** @deprecated */
    private int linkHopCount = UNCALCULATED;
    /*** @deprecated */
    private int embedHopCount = UNCALCULATED;


    /*** Per-URI user-agent override; null means use the global setting. */
    private String userAgent = null;



    /*** True once a link extractor has claimed this URI's content. */
    transient private boolean linkExtractorFinished = false;

    /***
     * Protection against outlink overflow.
     * Change value by setting alternate maximum in heritrix.properties.
     */
    public static final int MAX_OUTLINKS = Integer.
        parseInt(System.getProperty(CrawlURI.class.getName() + ".maxOutLinks",
            "6000"));

    /*** Count of outlinks discarded during this processing pass. */
    transient private int discardedOutlinks = 0;


    /*** Recorded size in bytes, protocol headers included;
     * UNCALCULATED (-1) until a fetcher sets it. */
    private long contentSize = UNCALCULATED;
    /*** Length of the content-body alone; UNCALCULATED until known. */
    private long contentLength = UNCALCULATED;

    /***
     * Current http recorder.
     *
     * Gets set upon successful request. Reset at start of processing chain.
     */
    private transient HttpRecorder httpRecorder = null;

    /***
     * Content type of a successfully fetched URI.
     *
     * May be null even on successfully fetched URI.
     */
    private String contentType = null;

    /***
     * True if this CrawlURI has been deemed a prerequisite by the
     * {@link org.archive.crawler.prefetch.PreconditionEnforcer}.
     *
     * This flag is used at least inside in the precondition enforcer so that
     * subsequent prerequisite tests know to let this CrawlURI through because
     * its a prerequisite needed by an earlier prerequisite tests (e.g. If
     * this is a robots.txt, then the subsequent login credentials prereq
     * test must not throw it out because its not a login curi).
     */
    private boolean prerequisite = false;

    /***
     * Set to true if this <code>curi</code> is to be POST'd rather than GET-d.
     */
    private boolean post = false;

    /***
     * Monotonically increasing number within a crawl;
     * useful for tending towards breadth-first ordering.
     * Will sometimes be truncated to 48 bits, so behavior
     * over 281 trillion instantiated CrawlURIs may be
     * buggy
     */
    protected long ordinal;

    /***
     * Cache of this candidate uuri as a string.
     *
     * Profiling shows us spending about 1-2% of total elapsed time in
     * toString.
     */
    private String cachedCrawlURIString = null;

    /***
     * Array to hold keys of alist members that persist across URI processings.
     * Any key mentioned in this list will not be cleared out at the end
     * of a pass down the processing chain.
     */
    private static final List<Object> alistPersistentMember
        = new CopyOnWriteArrayList<Object>(
            new String [] {A_CREDENTIAL_AVATARS_KEY});

    /***
     * A digest (hash, usually SHA1) of retrieved content-body.
     */
    private byte[] contentDigest = null;
    /*** Name of the scheme used for the digest above, e.g. "SHA1". */
    private String contentDigestScheme = null;
179
180
181 /***
182 * Create a new instance of CrawlURI from a {@link UURI}.
183 *
184 * @param uuri the UURI to base this CrawlURI on.
185 */
186 public CrawlURI(UURI uuri) {
187 super(uuri);
188 }
189
190 /***
191 * Create a new instance of CrawlURI from a {@link CandidateURI}
192 *
193 * @param caUri the CandidateURI to base this CrawlURI on.
194 * @param o Monotonically increasing number within a crawl.
195 */
196 @SuppressWarnings("deprecation")
197 public CrawlURI(CandidateURI caUri, long o) {
198 super(caUri.getUURI(), caUri.getPathFromSeed(), caUri.getVia(),
199 caUri.getViaContext());
200 ordinal = o;
201 setIsSeed(caUri.isSeed());
202 setSchedulingDirective(caUri.getSchedulingDirective());
203 setAList(caUri.getAList());
204 }
205
206 /***
207 * Takes a status code and converts it into a human readable string.
208 *
209 * @param code the status code
210 * @return a human readable string declaring what the status code is.
211 */
212 public static String fetchStatusCodesToString(int code){
213 switch(code){
214
215 case S_DNS_SUCCESS : return "DNS-1-OK";
216
217 case 100 : return "HTTP-100-Info-Continue";
218 case 101 : return "HTTP-101-Info-Switching Protocols";
219
220 case 200 : return "HTTP-200-Success-OK";
221 case 201 : return "HTTP-201-Success-Created";
222 case 202 : return "HTTP-202-Success-Accepted";
223 case 203 : return "HTTP-203-Success-Non-Authoritative";
224 case 204 : return "HTTP-204-Success-No Content ";
225 case 205 : return "HTTP-205-Success-Reset Content";
226 case 206 : return "HTTP-206-Success-Partial Content";
227
228 case 300 : return "HTTP-300-Redirect-Multiple Choices";
229 case 301 : return "HTTP-301-Redirect-Moved Permanently";
230 case 302 : return "HTTP-302-Redirect-Found";
231 case 303 : return "HTTP-303-Redirect-See Other";
232 case 304 : return "HTTP-304-Redirect-Not Modified";
233 case 305 : return "HTTP-305-Redirect-Use Proxy";
234 case 307 : return "HTTP-307-Redirect-Temporary Redirect";
235
236 case 400 : return "HTTP-400-ClientErr-Bad Request";
237 case 401 : return "HTTP-401-ClientErr-Unauthorized";
238 case 402 : return "HTTP-402-ClientErr-Payment Required";
239 case 403 : return "HTTP-403-ClientErr-Forbidden";
240 case 404 : return "HTTP-404-ClientErr-Not Found";
241 case 405 : return "HTTP-405-ClientErr-Method Not Allowed";
242 case 407 : return "HTTP-406-ClientErr-Not Acceptable";
243 case 408 : return "HTTP-407-ClientErr-Proxy Authentication Required";
244 case 409 : return "HTTP-408-ClientErr-Request Timeout";
245 case 410 : return "HTTP-409-ClientErr-Conflict";
246 case 406 : return "HTTP-410-ClientErr-Gone";
247 case 411 : return "HTTP-411-ClientErr-Length Required";
248 case 412 : return "HTTP-412-ClientErr-Precondition Failed";
249 case 413 : return "HTTP-413-ClientErr-Request Entity Too Large";
250 case 414 : return "HTTP-414-ClientErr-Request-URI Too Long";
251 case 415 : return "HTTP-415-ClientErr-Unsupported Media Type";
252 case 416 : return "HTTP-416-ClientErr-Requested Range Not Satisfiable";
253 case 417 : return "HTTP-417-ClientErr-Expectation Failed";
254
255 case 500 : return "HTTP-500-ServerErr-Internal Server Error";
256 case 501 : return "HTTP-501-ServerErr-Not Implemented";
257 case 502 : return "HTTP-502-ServerErr-Bad Gateway";
258 case 503 : return "HTTP-503-ServerErr-Service Unavailable";
259 case 504 : return "HTTP-504-ServerErr-Gateway Timeout";
260 case 505 : return "HTTP-505-ServerErr-HTTP Version Not Supported";
261
262 case S_BLOCKED_BY_USER:
263 return "Heritrix(" + S_BLOCKED_BY_USER + ")-Blocked by user";
264 case S_BLOCKED_BY_CUSTOM_PROCESSOR:
265 return "Heritrix(" + S_BLOCKED_BY_CUSTOM_PROCESSOR +
266 ")-Blocked by custom prefetch processor";
267 case S_DELETED_BY_USER:
268 return "Heritrix(" + S_DELETED_BY_USER + ")-Deleted by user";
269 case S_CONNECT_FAILED:
270 return "Heritrix(" + S_CONNECT_FAILED + ")-Connection failed";
271 case S_CONNECT_LOST:
272 return "Heritrix(" + S_CONNECT_LOST + ")-Connection lost";
273 case S_DEEMED_CHAFF:
274 return "Heritrix(" + S_DEEMED_CHAFF + ")-Deemed chaff";
275 case S_DEFERRED:
276 return "Heritrix(" + S_DEFERRED + ")-Deferred";
277 case S_DOMAIN_UNRESOLVABLE:
278 return "Heritrix(" + S_DOMAIN_UNRESOLVABLE
279 + ")-Domain unresolvable";
280 case S_OUT_OF_SCOPE:
281 return "Heritrix(" + S_OUT_OF_SCOPE + ")-Out of scope";
282 case S_DOMAIN_PREREQUISITE_FAILURE:
283 return "Heritrix(" + S_DOMAIN_PREREQUISITE_FAILURE
284 + ")-Domain prerequisite failure";
285 case S_ROBOTS_PREREQUISITE_FAILURE:
286 return "Heritrix(" + S_ROBOTS_PREREQUISITE_FAILURE
287 + ")-Robots prerequisite failure";
288 case S_OTHER_PREREQUISITE_FAILURE:
289 return "Heritrix(" + S_OTHER_PREREQUISITE_FAILURE
290 + ")-Other prerequisite failure";
291 case S_PREREQUISITE_UNSCHEDULABLE_FAILURE:
292 return "Heritrix(" + S_PREREQUISITE_UNSCHEDULABLE_FAILURE
293 + ")-Prerequisite unschedulable failure";
294 case S_ROBOTS_PRECLUDED:
295 return "Heritrix(" + S_ROBOTS_PRECLUDED + ")-Robots precluded";
296 case S_RUNTIME_EXCEPTION:
297 return "Heritrix(" + S_RUNTIME_EXCEPTION
298 + ")-Runtime exception";
299 case S_SERIOUS_ERROR:
300 return "Heritrix(" + S_SERIOUS_ERROR + ")-Serious error";
301 case S_TIMEOUT:
302 return "Heritrix(" + S_TIMEOUT + ")-Timeout";
303 case S_TOO_MANY_EMBED_HOPS:
304 return "Heritrix(" + S_TOO_MANY_EMBED_HOPS
305 + ")-Too many embed hops";
306 case S_TOO_MANY_LINK_HOPS:
307 return "Heritrix(" + S_TOO_MANY_LINK_HOPS
308 + ")-Too many link hops";
309 case S_TOO_MANY_RETRIES:
310 return "Heritrix(" + S_TOO_MANY_RETRIES + ")-Too many retries";
311 case S_UNATTEMPTED:
312 return "Heritrix(" + S_UNATTEMPTED + ")-Unattempted";
313 case S_UNFETCHABLE_URI:
314 return "Heritrix(" + S_UNFETCHABLE_URI + ")-Unfetchable URI";
315 case S_PROCESSING_THREAD_KILLED:
316 return "Heritrix(" + S_PROCESSING_THREAD_KILLED + ")-" +
317 "Processing thread killed";
318
319 default : return Integer.toString(code);
320 }
321 }
322
323
324 /***
325 * Return the overall/fetch status of this CrawlURI for its
326 * current trip through the processing loop.
327 *
328 * @return a value from FetchStatusCodes
329 */
330 public int getFetchStatus(){
331 return fetchStatus;
332 }
333
334 /***
335 * Set the overall/fetch status of this CrawlURI for
336 * its current trip through the processing loop.
337 *
338 * @param newstatus a value from FetchStatusCodes
339 */
340 public void setFetchStatus(int newstatus){
341 fetchStatus = newstatus;
342 }
343
344 /***
345 * Get the number of attempts at getting the document referenced by this
346 * URI.
347 *
348 * @return the number of attempts at getting the document referenced by this
349 * URI.
350 */
351 public int getFetchAttempts() {
352 return fetchAttempts;
353 }
354
355 /***
356 * Increment the number of attempts at getting the document referenced by
357 * this URI.
358 *
359 * @return the number of attempts at getting the document referenced by this
360 * URI.
361 */
362 public int incrementFetchAttempts() {
363
364 return fetchAttempts++;
365 }
366
367 /***
368 * Reset fetchAttempts counter.
369 */
370 public void resetFetchAttempts() {
371 this.fetchAttempts = 0;
372 }
373
374 /***
375 * Reset deferrals counter.
376 */
377 public void resetDeferrals() {
378 this.deferrals = 0;
379 }
380
381 /***
382 * Get the next processor to process this URI.
383 *
384 * @return the processor that should process this URI next.
385 */
386 public Processor nextProcessor() {
387 return nextProcessor;
388 }
389
390 /***
391 * Get the processor chain that should be processing this URI after the
392 * current chain is finished with it.
393 *
394 * @return the next processor chain to process this URI.
395 */
396 public ProcessorChain nextProcessorChain() {
397 return nextProcessorChain;
398 }
399
400 /***
401 * Set the next processor to process this URI.
402 *
403 * @param processor the next processor to process this URI.
404 */
405 public void setNextProcessor(Processor processor) {
406 nextProcessor = processor;
407 }
408
409 /***
410 * Set the next processor chain to process this URI.
411 *
412 * @param nextProcessorChain the next processor chain to process this URI.
413 */
414 public void setNextProcessorChain(ProcessorChain nextProcessorChain) {
415 this.nextProcessorChain = nextProcessorChain;
416 }
417
418 /***
419 * Do all actions associated with setting a <code>CrawlURI</code> as
420 * requiring a prerequisite.
421 *
422 * @param lastProcessorChain Last processor chain reference. This chain is
423 * where this <code>CrawlURI</code> goes next.
424 * @param preq Object to set a prerequisite.
425 * @throws URIException
426 */
427 public void markPrerequisite(String preq,
428 ProcessorChain lastProcessorChain) throws URIException {
429 Link link = createLink(preq,Link.PREREQ_MISC,Link.PREREQ_HOP);
430 setPrerequisiteUri(link);
431 incrementDeferrals();
432 setFetchStatus(S_DEFERRED);
433 skipToProcessorChain(lastProcessorChain);
434 }
435
436 /***
437 * Set a prerequisite for this URI.
438 * <p>
439 * A prerequisite is a URI that must be crawled before this URI can be
440 * crawled.
441 *
442 * @param link Link to set as prereq.
443 */
444 public void setPrerequisiteUri(Object link) {
445 putObject(A_PREREQUISITE_URI, link);
446 }
447
448 /***
449 * Get the prerequisite for this URI.
450 * <p>
451 * A prerequisite is a URI that must be crawled before this URI can be
452 * crawled.
453 *
454 * @return the prerequisite for this URI or null if no prerequisite.
455 */
456 public Object getPrerequisiteUri() {
457 return getObject(A_PREREQUISITE_URI);
458 }
459
460 /***
461 * @return True if this CrawlURI has a prerequisite.
462 */
463 public boolean hasPrerequisiteUri() {
464 return containsKey(A_PREREQUISITE_URI);
465 }
466
467 /***
468 * Returns true if this CrawlURI is a prerequisite.
469 *
470 * @return true if this CrawlURI is a prerequisite.
471 */
472 public boolean isPrerequisite() {
473 return this.prerequisite;
474 }
475
476 /***
477 * Set if this CrawlURI is itself a prerequisite URI.
478 *
479 * @param prerequisite True if this CrawlURI is itself a prerequiste uri.
480 */
481 public void setPrerequisite(boolean prerequisite) {
482 this.prerequisite = prerequisite;
483 }
484
485 /***
486 * @return This crawl URI as a string wrapped with 'CrawlURI(' +
487 * ')'.
488 */
489 public String getCrawlURIString() {
490 if (this.cachedCrawlURIString == null) {
491 synchronized (this) {
492 if (this.cachedCrawlURIString == null) {
493 this.cachedCrawlURIString =
494 "CrawlURI(" + toString() + ")";
495 }
496 }
497 }
498 return this.cachedCrawlURIString;
499 }
500
501 /***
502 * Get the content type of this URI.
503 *
504 * @return Fetched URIs content type. May be null.
505 */
506 public String getContentType() {
507 return this.contentType;
508 }
509
510 /***
511 * Set a fetched uri's content type.
512 *
513 * @param ct Contenttype. May be null.
514 */
515 public void setContentType(String ct) {
516 this.contentType = ct;
517 }
518
519 /***
520 * Set the number of the ToeThread responsible for processing this uri.
521 *
522 * @param i the ToeThread number.
523 */
524 public void setThreadNumber(int i) {
525 threadNumber = i;
526 }
527
528 /***
529 * Get the number of the ToeThread responsible for processing this uri.
530 *
531 * @return the ToeThread number.
532 */
533 public int getThreadNumber() {
534 return threadNumber;
535 }
536
537 /***
538 * Increment the deferral count.
539 *
540 */
541 public void incrementDeferrals() {
542 deferrals++;
543 }
544
545 /***
546 * Get the deferral count.
547 *
548 * @return the deferral count.
549 */
550 public int getDeferrals() {
551 return deferrals;
552 }
553
554 /***
555 * Remove all attributes set on this uri.
556 * <p>
557 * This methods removes the attribute list.
558 */
559 public void stripToMinimal() {
560 clearAList();
561 }
562
563 /***
564 * Get the size in bytes of this URI's recorded content, inclusive
565 * of things like protocol headers. It is the responsibility of the
566 * classes which fetch the URI to set this value accordingly -- it is
567 * not calculated/verified within CrawlURI.
568 *
569 * This value is consulted in reporting/logging/writing-decisions.
570 *
571 * @see #setContentSize()
572 * @return contentSize
573 */
574 public long getContentSize(){
575 return contentSize;
576 }
577
578 /***
579 * Make note of a non-fatal error, local to a particular Processor,
580 * which should be logged somewhere, but allows processing to continue.
581 *
582 * This is how you add to the local-error log (the 'localized' in
583 * the below is making an error local rather than global, not
584 * making a swiss-french version of the error.).
585 *
586 * @param processorName Name of processor the exception was thrown
587 * in.
588 * @param ex Throwable to log.
589 * @param message Extra message to log beyond exception message.
590 */
591 public void addLocalizedError(final String processorName,
592 final Throwable ex, final String message) {
593 List<LocalizedError> localizedErrors;
594 if (containsKey(A_LOCALIZED_ERRORS)) {
595 @SuppressWarnings("unchecked")
596 List<LocalizedError> temp
597 = (List<LocalizedError>) getObject(A_LOCALIZED_ERRORS);
598 localizedErrors = temp;
599 } else {
600 localizedErrors = new ArrayList<LocalizedError>();
601 putObject(A_LOCALIZED_ERRORS, localizedErrors);
602 }
603
604 localizedErrors.add(new LocalizedError(processorName, ex, message));
605 addAnnotation("le:" + getClassSimpleName(ex.getClass()) + "@" +
606 processorName);
607 }
608
609
610 protected String getClassSimpleName(final Class c) {
611 String classname = c.getName();
612 int index = classname.lastIndexOf('.');
613 return ((index > 0 && (index + 1) < classname.length())?
614 classname.substring(index + 1): classname);
615 }
616
617 /***
618 * Add an annotation: an abbrieviated indication of something special
619 * about this URI that need not be present in every crawl.log line,
620 * but should be noted for future reference.
621 *
622 * @param annotation the annotation to add; should not contain
623 * whitespace or a comma
624 */
625 public void addAnnotation(String annotation) {
626 String annotations;
627 if(containsKey(A_ANNOTATIONS)) {
628 annotations = getString(A_ANNOTATIONS);
629 annotations += ","+annotation;
630 } else {
631 annotations = annotation;
632 }
633
634 putString(A_ANNOTATIONS,annotations);
635 }
636
637 /***
638 * TODO: Implement truncation using booleans rather than as this
639 * ugly String parse.
640 * @return True if fetch was truncated.
641 */
642 public boolean isTruncatedFetch() {
643 return annotationContains(TRUNC_SUFFIX);
644 }
645
646 public boolean isLengthTruncatedFetch() {
647 return annotationContains(LENGTH_TRUNC);
648 }
649
650 public boolean isTimeTruncatedFetch() {
651 return annotationContains(TIMER_TRUNC);
652 }
653
654 public boolean isHeaderTruncatedFetch() {
655 return annotationContains(HEADER_TRUNC);
656 }
657
658 protected boolean annotationContains(final String str2Find) {
659 boolean result = false;
660 if (!containsKey(A_ANNOTATIONS)) {
661 return result;
662 }
663 String annotations = getString(A_ANNOTATIONS);
664 if (annotations != null && annotations.length() > 0) {
665 result = annotations.indexOf(str2Find) >= 0;
666 }
667 return result;
668 }
669
670 /***
671 * Get the annotations set for this uri.
672 *
673 * @return the annotations set for this uri.
674 */
675 public String getAnnotations() {
676 return (containsKey(A_ANNOTATIONS))?
677 getString(A_ANNOTATIONS): null;
678 }
679
680 /***
681 * Get the embeded hop count.
682 *
683 * @return the embeded hop count.
684 * @deprecated
685 */
686 public int getEmbedHopCount() {
687 return embedHopCount;
688 }
689
690 /***
691 * Get the link hop count.
692 *
693 * @return the link hop count.
694 * @deprecated
695 */
696 public int getLinkHopCount() {
697 return linkHopCount;
698 }
699
700 /***
701 * Mark this uri as being a seed.
702 *
703 *
704 * @deprecated
705 */
706 public void markAsSeed() {
707 linkHopCount = 0;
708 embedHopCount = 0;
709 }
710
711 /***
712 * Get the user agent to use for crawling this URI.
713 *
714 * If null the global setting should be used.
715 *
716 * @return user agent or null
717 */
718 public String getUserAgent() {
719 return userAgent;
720 }
721
722 /***
723 * Set the user agent to use when crawling this URI.
724 *
725 * If not set the global settings should be used.
726 *
727 * @param string user agent to use
728 */
729 public void setUserAgent(String string) {
730 userAgent = string;
731 }
732
733 /***
734 * Set which processor should be the next processor to process this uri
735 * instead of using the default next processor.
736 *
737 * @param processorChain the processor chain to skip to.
738 * @param processor the processor in the processor chain to skip to.
739 */
740 public void skipToProcessor(ProcessorChain processorChain,
741 Processor processor) {
742 setNextProcessorChain(processorChain);
743 setNextProcessor(processor);
744 }
745
746 /***
747 * Set which processor chain should be processing this uri next.
748 *
749 * @param processorChain the processor chain to skip to.
750 */
751 public void skipToProcessorChain(ProcessorChain processorChain) {
752 setNextProcessorChain(processorChain);
753 setNextProcessor(null);
754 }
755
756 /***
757 * For completed HTTP transactions, the length of the content-body.
758 *
759 * @return For completed HTTP transactions, the length of the content-body.
760 */
761 public long getContentLength() {
762 if (this.contentLength < 0) {
763 this.contentLength = (getHttpRecorder() != null)?
764 getHttpRecorder().getResponseContentLength(): 0;
765 }
766 return this.contentLength;
767 }
768
769 /***
770 * Get size of data recorded (transferred)
771 *
772 * @return recorded data size
773 */
774 public long getRecordedSize() {
775 return (getHttpRecorder() != null)
776 ? getHttpRecorder().getRecordedInput().getSize()
777
778 : getContentSize();
779 }
780
781 /***
782 * Sets the 'content size' for the URI, which is considered inclusive
783 * of all recorded material (such as protocol headers) or even material
784 * 'virtually' considered (as in material from a previous fetch
785 * confirmed unchanged with a server). (In contrast, content-length
786 * matches the HTTP definition, that of the enclosed content-body.)
787 *
788 * Should be set by a fetcher or other processor as soon as the final
789 * size of recorded content is known. Setting to an artificial/incorrect
790 * value may affect other reporting/processing.
791 *
792 * @param l Content size.
793 */
794 public void setContentSize(long l) {
795 contentSize = l;
796 }
797
798 /***
799 * If true then a link extractor has already claimed this CrawlURI and
800 * performed link extraction on the document content. This does not
801 * preclude other link extractors that may have an interest in this
802 * CrawlURI from also doing link extraction but default behavior should
803 * be to not run if link extraction has already been done.
804 *
805 * <p>There is an onus on link extractors to set this flag if they have
806 * run.
807 *
808 * <p>The only extractor of the default Heritrix set that does not
809 * respect this flag is
810 * {@link org.archive.crawler.extractor.ExtractorHTTP}.
811 * It runs against HTTP headers, not the document content.
812 *
813 * @return True if a processor has performed link extraction on this
814 * CrawlURI
815 *
816 * @see #linkExtractorFinished()
817 */
818 public boolean hasBeenLinkExtracted(){
819 return linkExtractorFinished;
820 }
821
822 /***
823 * Note that link extraction has been performed on this CrawlURI. A processor
824 * doing link extraction should invoke this method once it has finished it's
825 * work. It should invoke it even if no links are extracted. It should only
826 * invoke this method if the link extraction was performed on the document
827 * body (not the HTTP headers etc.).
828 *
829 * @see #hasBeenLinkExtracted()
830 */
831 public void linkExtractorFinished() {
832 linkExtractorFinished = true;
833 if(discardedOutlinks>0) {
834 addAnnotation("dol:"+discardedOutlinks);
835 }
836 }
837
838 /***
839 * Notify CrawlURI it is about to be logged; opportunity
840 * for self-annotation
841 */
842 public void aboutToLog() {
843 if (fetchAttempts>1) {
844 addAnnotation(fetchAttempts+"t");
845 }
846 }
847
848 /***
849 * Get the http recorder associated with this uri.
850 *
851 * @return Returns the httpRecorder. May be null but its set early in
852 * FetchHttp so there is an issue if its null.
853 */
854 public HttpRecorder getHttpRecorder() {
855 return httpRecorder;
856 }
857
858 /***
859 * Set the http recorder to be associated with this uri.
860 *
861 * @param httpRecorder The httpRecorder to set.
862 */
863 public void setHttpRecorder(HttpRecorder httpRecorder) {
864 this.httpRecorder = httpRecorder;
865 }
866
867 /***
868 * Return true if this is a http transaction.
869 *
870 * TODO: Compound this and {@link #isPost()} method so that there is one
871 * place to go to find out if get http, post http, ftp, dns.
872 *
873 * @return True if this is a http transaction.
874 */
875 public boolean isHttpTransaction() {
876 return containsKey(A_HTTP_TRANSACTION);
877 }
878
879 /***
880 * Clean up after a run through the processing chain.
881 *
882 * Called on the end of processing chain by Frontier#finish. Null out any
883 * state gathered during processing.
884 */
885 public void processingCleanup() {
886 this.httpRecorder = null;
887 this.fetchStatus = S_UNATTEMPTED;
888 this.setPrerequisite(false);
889 this.contentSize = UNCALCULATED;
890 this.contentLength = UNCALCULATED;
891
892 this.linkExtractorFinished = false;
893
894 setAList(getPersistentAList());
895 }
896
897 public AList getPersistentAList() {
898 AList newAList = new HashtableAList();
899
900 if(alistPersistentMember!=null && alistPersistentMember.size() > 0) {
901 newAList.copyKeysFrom(alistPersistentMember.iterator(), getAList());
902 }
903
904 List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
905 if(heritableKeys!=null) {
906 newAList.copyKeysFrom(heritableKeys.iterator(), getAList());
907 }
908 return newAList;
909 }
910
911 /***
912 * Make a <code>CrawlURI</code> from the passed <code>CandidateURI</code>.
913 *
914 * Its safe to pass a CrawlURI instance. In this case we just return it
915 * as a result. Otherwise, we create new CrawlURI instance.
916 *
917 * @param caUri Candidate URI.
918 * @param ordinal
919 * @return A crawlURI made from the passed CandidateURI.
920 */
921 public static CrawlURI from(CandidateURI caUri, long ordinal) {
922 return (caUri instanceof CrawlURI)?
923 (CrawlURI)caUri: new CrawlURI(caUri, ordinal);
924 }
925
926 /***
927 * @param avatars Credential avatars to save off.
928 */
929 private void setCredentialAvatars(Set avatars) {
930 putObject(A_CREDENTIAL_AVATARS_KEY, avatars);
931 }
932
933 /***
934 * @return Credential avatars. Null if none set.
935 */
936 @SuppressWarnings("unchecked")
937 public Set<CredentialAvatar> getCredentialAvatars() {
938 return (Set)getObject(A_CREDENTIAL_AVATARS_KEY);
939 }
940
941 /***
942 * @return True if there are avatars attached to this instance.
943 */
944 public boolean hasCredentialAvatars() {
945 return getCredentialAvatars() != null &&
946 getCredentialAvatars().size() > 0;
947 }
948
949 /***
950 * Add an avatar.
951 *
952 * We do lazy instantiation.
953 *
954 * @param ca Credential avatar to add to set of avatars.
955 */
956 public void addCredentialAvatar(CredentialAvatar ca) {
957 Set<CredentialAvatar> avatars = getCredentialAvatars();
958 if (avatars == null) {
959 avatars = new HashSet<CredentialAvatar>();
960 setCredentialAvatars(avatars);
961 }
962 avatars.add(ca);
963 }
964
965 /***
966 * Remove all credential avatars from this crawl uri.
967 */
968 public void removeCredentialAvatars() {
969 if (hasCredentialAvatars()) {
970 remove(A_CREDENTIAL_AVATARS_KEY);
971 }
972 }
973
974 /***
975 * Remove all credential avatars from this crawl uri.
976 * @param ca Avatar to remove.
977 * @return True if we removed passed parameter. False if no operation
978 * performed.
979 */
980 public boolean removeCredentialAvatar(CredentialAvatar ca) {
981 boolean result = false;
982 Set avatars = getCredentialAvatars();
983 if (avatars != null && avatars.size() > 0) {
984 result = avatars.remove(ca);
985 }
986 return result;
987 }
988
989 /***
990 * Ask this URI if it was a success or not.
991 *
992 * Only makes sense to call this method after execution of
993 * HttpMethod#execute. Regard any status larger then 0 as success
994 * except for below caveat regarding 401s. Use {@link #is2XXSuccess()} if
995 * looking for a status code in the 200 range.
996 *
997 * <p>401s caveat: If any rfc2617 credential data present and we got a 401
998 * assume it got loaded in FetchHTTP on expectation that we're to go around
999 * the processing chain again. Report this condition as a failure so we
1000 * get another crack at the processing chain only this time we'll be making
1001 * use of the loaded credential data.
1002 *
1003 * @return True if ths URI has been successfully processed.
1004 * @see #is2XXSuccess()
1005 */
1006 public boolean isSuccess() {
1007 boolean result = false;
1008 int statusCode = this.fetchStatus;
1009 if (statusCode == HttpStatus.SC_UNAUTHORIZED &&
1010 hasRfc2617CredentialAvatar()) {
1011 result = false;
1012 } else {
1013 result = (statusCode > 0);
1014 }
1015 return result;
1016 }
1017
1018 /***
1019 * @return True if status code is in the 2xx range.
1020 * @see #isSuccess()
1021 */
1022 public boolean is2XXSuccess() {
1023 return this.fetchStatus >= 200 && this.fetchStatus < 300;
1024 }
1025
1026 /***
1027 * @return True if we have an rfc2617 payload.
1028 */
1029 public boolean hasRfc2617CredentialAvatar() {
1030 boolean result = false;
1031 Set avatars = getCredentialAvatars();
1032 if (avatars != null && avatars.size() > 0) {
1033 for (Iterator i = avatars.iterator(); i.hasNext();) {
1034 if (((CredentialAvatar)i.next()).
1035 match(Rfc2617Credential.class)) {
1036 result = true;
1037 break;
1038 }
1039 }
1040 }
1041 return result;
1042 }
1043
1044 /***
1045 * Set whether this URI should be fetched by sending a HTTP POST request.
1046 * Else a HTTP GET request will be used.
1047 *
1048 * @param b Set whether this curi is to be POST'd. Else its to be GET'd.
1049 */
1050 public void setPost(boolean b) {
1051 this.post = b;
1052 }
1053
1054 /***
1055 * Returns true if this URI should be fetched by sending a HTTP POST request.
1056 *
1057 *
1058 * TODO: Compound this and {@link #isHttpTransaction()} method so that there
1059 * is one place to go to find out if get http, post http, ftp, dns.
1060 *
1061 * @return Returns is this CrawlURI instance is to be posted.
1062 */
1063 public boolean isPost() {
1064 return this.post;
1065 }
1066
1067 /***
1068 * Set the retained content-digest value (usu. SHA1).
1069 *
1070 * @param digestValue
1071 * @deprecated Use {@link #setContentDigest(String scheme, byte[])}
1072 */
1073 public void setContentDigest(byte[] digestValue) {
1074 setContentDigest("SHA1", digestValue);
1075 }
1076
1077 public void setContentDigest(final String scheme,
1078 final byte [] digestValue) {
1079 this.contentDigest = digestValue;
1080 this.contentDigestScheme = scheme;
1081 }
1082
1083 public String getContentDigestSchemeString() {
1084 if(this.contentDigest==null) {
1085 return null;
1086 }
1087 return this.contentDigestScheme + ":" + getContentDigestString();
1088 }
1089
1090 /***
1091 * Return the retained content-digest value, if any.
1092 *
1093 * @return Digest value.
1094 */
1095 public Object getContentDigest() {
1096 return contentDigest;
1097 }
1098
1099 public String getContentDigestString() {
1100 if(this.contentDigest==null) {
1101 return null;
1102 }
1103 return Base32.encode(this.contentDigest);
1104 }
1105
    // Opaque slot assigned by an enclosing/queueing facility (see
    // setHolder/getHolder); transient, so not serialized.
    transient Object holder;
    // Companion key assigned by the same external facility (e.g. Frontier,
    // per getHolderKey javadoc); transient, so not serialized.
    transient Object holderKey;
1108
1109 /***
1110 * Remember a 'holder' to which some enclosing/queueing
1111 * facility has assigned this CrawlURI
1112 * .
1113 * @param obj
1114 */
1115 public void setHolder(Object obj) {
1116 holder=obj;
1117 }
1118
1119 /***
1120 * Return the 'holder' for the convenience of
1121 * an external facility.
1122 *
1123 * @return holder
1124 */
1125 public Object getHolder() {
1126 return holder;
1127 }
1128
1129 /***
1130 * Remember a 'holderKey' which some enclosing/queueing
1131 * facility has assigned this CrawlURI
1132 * .
1133 * @param obj
1134 */
1135 public void setHolderKey(Object obj) {
1136 holderKey=obj;
1137 }
1138 /***
1139 * Return the 'holderKey' for convenience of
1140 * an external facility (Frontier).
1141 *
1142 * @return holderKey
1143 */
1144 public Object getHolderKey() {
1145 return holderKey;
1146 }
1147
1148 /***
1149 * Get the ordinal (serial number) assigned at creation.
1150 *
1151 * @return ordinal
1152 */
1153 public long getOrdinal() {
1154 return ordinal;
1155 }
1156
    /*** Spot for an integer cost to be placed by external facility (frontier).
     * Initialized to UNCALCULATED (-1) until assigned.
     * Cost is truncated to 8 bits at times, so should not exceed 255. */
    int holderCost = UNCALCULATED;
1160 /***
1161 * Return the 'holderCost' for convenience of external facility (frontier)
1162 * @return value of holderCost
1163 */
1164 public int getHolderCost() {
1165 return holderCost;
1166 }
1167
1168 /***
1169 * Remember a 'holderCost' which some enclosing/queueing
1170 * facility has assigned this CrawlURI
1171 * @param cost value to remember
1172 */
1173 public void setHolderCost(int cost) {
1174 holderCost = cost;
1175 }
1176
1177 /***
1178 * All discovered outbound Links (navlinks, embeds, etc.)
1179 * Can either contain Link instances or CandidateURI instances, or both.
1180 * The LinksScoper processor converts Link instances in this collection
1181 * to CandidateURI instances.
1182 */
1183 transient Collection<Object> outLinks = new HashSet<Object>();
1184
1185 /***
1186 * Returns discovered links. The returned collection might be empty if
1187 * no links were discovered, or if something like LinksScoper promoted
1188 * the links to CandidateURIs.
1189 *
1190 * Elements can be removed from the returned collection, but not added.
1191 * To add a discovered link, use one of the createAndAdd methods or
1192 * {@link #getOutObjects()}.
1193 *
1194 * @return Collection of all discovered outbound Links
1195 */
1196 public Collection<Link> getOutLinks() {
1197 return Transform.subclasses(outLinks, Link.class);
1198 }
1199
1200 /***
1201 * Returns discovered candidate URIs. The returned collection will be
1202 * emtpy until something like LinksScoper promotes discovered Links
1203 * into CandidateURIs.
1204 *
1205 * Elements can be removed from the returned collection, but not added.
1206 * To add a candidate URI, use {@link #replaceOutlinks(Collection)} or
1207 * {@link #getOutObjects}.
1208 *
1209 * @return Collection of candidate URIs
1210 */
1211 public Collection<CandidateURI> getOutCandidates() {
1212 return Transform.subclasses(outLinks, CandidateURI.class);
1213 }
1214
1215
1216 /***
1217 * Returns all of the outbound objects. The returned Collection will
1218 * contain Link instances, or CandidateURI instances, or both.
1219 *
1220 * @return the collection of Links and/or CandidateURIs
1221 */
1222 public Collection<Object> getOutObjects() {
1223 return outLinks;
1224 }
1225
1226 /***
1227 * Add a discovered Link, unless it would exceed the max number
1228 * to accept. (If so, increment discarded link counter.)
1229 *
1230 * @param link the Link to add
1231 */
1232 public void addOutLink(Link link) {
1233 if (outLinks.size() < MAX_OUTLINKS) {
1234 outLinks.add(link);
1235 } else {
1236
1237 discardedOutlinks++;
1238 }
1239 }
1240
1241 public void clearOutlinks() {
1242 this.outLinks.clear();
1243 }
1244
1245 /***
1246 * Replace current collection of links w/ passed list.
1247 * Used by Scopers adjusting the list of links (removing those
1248 * not in scope and promoting Links to CandidateURIs).
1249 *
1250 * @param a collection of CandidateURIs replacing any previously
1251 * existing outLinks or outCandidates
1252 */
1253 public void replaceOutlinks(Collection<CandidateURI> links) {
1254 clearOutlinks();
1255 this.outLinks.addAll(links);
1256 }
1257
1258
1259 /***
1260 * @return Count of outlinks.
1261 */
1262 public int outlinksSize() {
1263 return this.outLinks.size();
1264 }
1265
1266 /***
1267 * Convenience method for creating a Link discovered at this URI
1268 * with the given string and context
1269 *
1270 * @param url
1271 * String to use to create Link
1272 * @param context
1273 * CharSequence context to use
1274 * @param hopType
1275 * @return Link.
1276 * @throws URIException
1277 * if Link UURI cannot be constructed
1278 */
1279 public Link createLink(String url, CharSequence context,
1280 char hopType) throws URIException {
1281 return new Link(getUURI(), UURIFactory.getInstance(getUURI(),
1282 url), context, hopType);
1283 }
1284
1285 /***
1286 * Convenience method for creating a Link with the given string and
1287 * context
1288 *
1289 * @param url
1290 * String to use to create Link
1291 * @param context
1292 * CharSequence context to use
1293 * @param hopType
1294 * @throws URIException
1295 * if Link UURI cannot be constructed
1296 */
1297 public void createAndAddLink(String url, CharSequence context,
1298 char hopType) throws URIException {
1299 addOutLink(createLink(url, context, hopType));
1300 }
1301
1302 /***
1303 * Convenience method for creating a Link with the given string and
1304 * context, relative to a previously set base HREF if available (or
1305 * relative to the current CrawlURI if no other base has been set)
1306 *
1307 * @param url String URL to add as destination of link
1308 * @param context String context where link was discovered
1309 * @param hopType char hop-type indicator
1310 * @throws URIException
1311 */
1312 public void createAndAddLinkRelativeToBase(String url,
1313 CharSequence context, char hopType) throws URIException {
1314 addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1315 getBaseURI(), url), context, hopType));
1316 }
1317
1318 /***
1319 * Convenience method for creating a Link with the given string and
1320 * context, relative to this CrawlURI's via UURI if available. (If
1321 * a via is not available, falls back to using
1322 * #createAndAddLinkRelativeToBase.)
1323 *
1324 * @param url String URL to add as destination of link
1325 * @param context String context where link was discovered
1326 * @param hopType char hop-type indicator
1327 * @throws URIException
1328 */
1329 public void createAndAddLinkRelativeToVia(String url,
1330 CharSequence context, char hopType) throws URIException {
1331 if(getVia()!=null) {
1332 addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1333 getVia(), url), context, hopType));
1334 } else {
1335
1336 createAndAddLinkRelativeToBase(url,context,hopType);
1337 }
1338 }
1339
1340 /***
1341 * Set the (HTML) Base URI used for derelativizing internal URIs.
1342 *
1343 * @param baseHref String base href to use
1344 * @throws URIException if supplied string cannot be interpreted as URI
1345 */
1346 public void setBaseURI(String baseHref) throws URIException {
1347 putObject(A_HTML_BASE, UURIFactory.getInstance(baseHref));
1348 }
1349
1350 /***
1351 * Get the (HTML) Base URI used for derelativizing internal URIs.
1352 *
1353 * @return UURI base URI previously set
1354 */
1355 public UURI getBaseURI() {
1356 if (!containsKey(A_HTML_BASE)) {
1357 return getUURI();
1358 }
1359 return (UURI)getObject(A_HTML_BASE);
1360 }
1361
1362 /***
1363 * Add the key of alist items you want to persist across
1364 * processings.
1365 * @param key Key to add.
1366 */
1367 public static void addAlistPersistentMember(Object key) {
1368 alistPersistentMember.add(key);
1369 }
1370
1371 /***
1372 * @param key Key to remove.
1373 * @return True if list contained the element.
1374 */
1375 public static boolean removeAlistPersistentMember(Object key) {
1376 return alistPersistentMember.remove(key);
1377 }
1378
1379 /***
1380 * Custom serialization writing an empty 'outLinks' as null. Estimated
1381 * to save ~20 bytes in serialized form.
1382 *
1383 * @param stream
1384 * @throws IOException
1385 */
1386 private void writeObject(ObjectOutputStream stream) throws IOException {
1387 stream.defaultWriteObject();
1388 stream.writeObject((outLinks.isEmpty()) ? null : outLinks);
1389 }
1390
1391 /***
1392 * Custom deserialization recreating empty HashSet from null in 'outLinks'
1393 * slot.
1394 *
1395 * @param stream
1396 * @throws IOException
1397 * @throws ClassNotFoundException
1398 */
1399 private void readObject(ObjectInputStream stream) throws IOException,
1400 ClassNotFoundException {
1401 stream.defaultReadObject();
1402 @SuppressWarnings("unchecked")
1403 HashSet<Object> ol = (HashSet<Object>) stream.readObject();
1404 outLinks = (ol == null) ? new HashSet<Object>() : ol;
1405 }
1406
1407 public long getFetchDuration() {
1408 if(! containsKey(A_FETCH_COMPLETED_TIME)) {
1409 return -1;
1410 }
1411
1412 long completedTime = getLong(A_FETCH_COMPLETED_TIME);
1413 long beganTime = getLong(A_FETCH_BEGAN_TIME);
1414 return completedTime - beganTime;
1415 }
1416
1417
1418 }