View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * CrawlURI.java
20   * Created on Apr 16, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.datamodel;
25  
26  import java.io.IOException;
27  import java.io.ObjectInputStream;
28  import java.io.ObjectOutputStream;
29  import java.util.ArrayList;
30  import java.util.Collection;
31  import java.util.HashSet;
32  import java.util.Iterator;
33  import java.util.List;
34  import java.util.Set;
35  import java.util.concurrent.CopyOnWriteArrayList;
36  
37  import org.apache.commons.httpclient.HttpStatus;
38  import org.apache.commons.httpclient.URIException;
39  import org.archive.crawler.datamodel.credential.CredentialAvatar;
40  import org.archive.crawler.datamodel.credential.Rfc2617Credential;
41  import org.archive.crawler.extractor.Link;
42  import org.archive.crawler.framework.Processor;
43  import org.archive.crawler.framework.ProcessorChain;
44  import org.archive.crawler.util.Transform;
45  import org.archive.net.UURI;
46  import org.archive.net.UURIFactory;
47  import org.archive.util.ArchiveUtils;
48  import org.archive.util.Base32;
49  import org.archive.util.HttpRecorder;
50  
51  import st.ata.util.AList;
52  import st.ata.util.HashtableAList;
53  
54  
55  /***
56   * Represents a candidate URI and the associated state it
57   * collects as it is crawled.
58   *
59   * <p>Core state is in instance variables but a flexible
60   * attribute list is also available. Use this 'bucket' to carry
61   * custom processing extracted data and state across CrawlURI
62   * processing.  See the {@link #putString(String, String)},
63   * {@link #getString(String)}, etc. 
64   *
65   * @author Gordon Mohr
66   */
67  public class CrawlURI extends CandidateURI
68  implements FetchStatusCodes {
69  
70      private static final long serialVersionUID = 7874096757350100472L;
71  
72      public static final int UNCALCULATED = -1;
73      
74      // INHERITED FROM CANDIDATEURI
75      // uuri: core identity: the "usable URI" to be crawled
76      // isSeed
77      // inScopeVersion
78      // pathFromSeed
79      // via
80  
81      // Processing progress
82      transient private Processor nextProcessor;
83      transient private ProcessorChain nextProcessorChain;
84      private int fetchStatus = 0;    // default to unattempted
85      private int deferrals = 0;     // count of postponements for prerequisites
86      private int fetchAttempts = 0; // the number of fetch attempts that have been made
87      transient private int threadNumber;
88  
89      // dynamic context
90      /*** @deprecated */
91      private int linkHopCount = UNCALCULATED; // from seeds
92      /*** @deprecated */
93      private int embedHopCount = UNCALCULATED; // from a sure link; reset upon any link traversal
94  
95      // User agent to masquerade as when crawling this URI. If null, globals should be used
96      private String userAgent = null;
97  
98      // Once a link extractor has finished processing this curi this will be
99      // set as true
100     transient private boolean linkExtractorFinished = false;
101 
102     /***
103      * Protection against outlink overflow.
104      * Change value by setting alternate maximum in heritrix.properties.
105      */
106     public static final int MAX_OUTLINKS = Integer.
107         parseInt(System.getProperty(CrawlURI.class.getName() + ".maxOutLinks",
108             "6000"));
109     
110     transient private int discardedOutlinks = 0; 
111     
112 ////////////////////////////////////////////////////////////////////
113     private long contentSize = UNCALCULATED;
114     private long contentLength = UNCALCULATED;
115 
116     /***
117      * Current http recorder.
118      *
119      * Gets set upon successful request.  Reset at start of processing chain.
120      */
121     private transient HttpRecorder httpRecorder = null;
122 
123     /***
124      * Content type of a successfully fetched URI.
125      *
126      * May be null even on successfully fetched URI.
127      */
128     private String contentType = null;
129 
130     /***
131      * True if this CrawlURI has been deemed a prerequisite by the
132      * {@link org.archive.crawler.prefetch.PreconditionEnforcer}.
133      *
134      * This flag is used at least inside in the precondition enforcer so that
135      * subsequent prerequisite tests know to let this CrawlURI through because
136      * its a prerequisite needed by an earlier prerequisite tests (e.g. If
137      * this is a robots.txt, then the subsequent login credentials prereq
138      * test must not throw it out because its not a login curi).
139      */
140     private boolean prerequisite = false;
141 
142     /***
143      * Set to true if this <code>curi</code> is to be POST'd rather than GET-d.
144      */
145     private boolean post = false;
146 
147     /*** 
148      * Monotonically increasing number within a crawl;
149      * useful for tending towards breadth-first ordering.
150      * Will sometimes be truncated to 48 bits, so behavior
151      * over 281 trillion instantiated CrawlURIs may be 
152      * buggy
153      */
154     protected long ordinal;
155 
156     /***
157      * Cache of this candidate uuri as a string.
158      *
159      * Profiling shows us spending about 1-2% of total elapsed time in
160      * toString.
161      */
162     private String cachedCrawlURIString = null;
163     
164     /***
165      * Array to hold keys of alist members that persist across URI processings.
166      * Any key mentioned in this list will not be cleared out at the end
167      * of a pass down the processing chain.
168      */
169     private static final List<Object> alistPersistentMember
170      = new CopyOnWriteArrayList<Object>(
171             new String [] {A_CREDENTIAL_AVATARS_KEY});
172 
173     /***
174      * A digest (hash, usually SHA1) of retrieved content-body. 
175      * 
176      */
177     private byte[] contentDigest = null;
178     private String contentDigestScheme = null;
179 
180 
181     /***
182      * Create a new instance of CrawlURI from a {@link UURI}.
183      *
184      * @param uuri the UURI to base this CrawlURI on.
185      */
186     public CrawlURI(UURI uuri) {
187         super(uuri);
188     }
189 
190     /***
191      * Create a new instance of CrawlURI from a {@link CandidateURI}
192      *
193      * @param caUri the CandidateURI to base this CrawlURI on.
194      * @param o Monotonically increasing number within a crawl.
195      */
196     @SuppressWarnings("deprecation")
197     public CrawlURI(CandidateURI caUri, long o) {
198         super(caUri.getUURI(), caUri.getPathFromSeed(), caUri.getVia(),
199             caUri.getViaContext());
200         ordinal = o;
201         setIsSeed(caUri.isSeed());
202         setSchedulingDirective(caUri.getSchedulingDirective());
203         setAList(caUri.getAList());
204     }
205 
206     /***
207      * Takes a status code and converts it into a human readable string.
208      *
209      * @param code the status code
210      * @return a human readable string declaring what the status code is.
211      */
212     public static String fetchStatusCodesToString(int code){
213         switch(code){
214             // DNS
215             case S_DNS_SUCCESS : return "DNS-1-OK";
216             // HTTP Informational 1xx
217             case 100  : return "HTTP-100-Info-Continue";
218             case 101  : return "HTTP-101-Info-Switching Protocols";
219             // HTTP Successful 2xx
220             case 200  : return "HTTP-200-Success-OK";
221             case 201  : return "HTTP-201-Success-Created";
222             case 202  : return "HTTP-202-Success-Accepted";
223             case 203  : return "HTTP-203-Success-Non-Authoritative";
224             case 204  : return "HTTP-204-Success-No Content ";
225             case 205  : return "HTTP-205-Success-Reset Content";
226             case 206  : return "HTTP-206-Success-Partial Content";
227             // HTTP Redirection 3xx
228             case 300  : return "HTTP-300-Redirect-Multiple Choices";
229             case 301  : return "HTTP-301-Redirect-Moved Permanently";
230             case 302  : return "HTTP-302-Redirect-Found";
231             case 303  : return "HTTP-303-Redirect-See Other";
232             case 304  : return "HTTP-304-Redirect-Not Modified";
233             case 305  : return "HTTP-305-Redirect-Use Proxy";
234             case 307  : return "HTTP-307-Redirect-Temporary Redirect";
235             // HTTP Client Error 4xx
236             case 400  : return "HTTP-400-ClientErr-Bad Request";
237             case 401  : return "HTTP-401-ClientErr-Unauthorized";
238             case 402  : return "HTTP-402-ClientErr-Payment Required";
239             case 403  : return "HTTP-403-ClientErr-Forbidden";
240             case 404  : return "HTTP-404-ClientErr-Not Found";
241             case 405  : return "HTTP-405-ClientErr-Method Not Allowed";
242             case 407  : return "HTTP-406-ClientErr-Not Acceptable";
243             case 408  : return "HTTP-407-ClientErr-Proxy Authentication Required";
244             case 409  : return "HTTP-408-ClientErr-Request Timeout";
245             case 410  : return "HTTP-409-ClientErr-Conflict";
246             case 406  : return "HTTP-410-ClientErr-Gone";
247             case 411  : return "HTTP-411-ClientErr-Length Required";
248             case 412  : return "HTTP-412-ClientErr-Precondition Failed";
249             case 413  : return "HTTP-413-ClientErr-Request Entity Too Large";
250             case 414  : return "HTTP-414-ClientErr-Request-URI Too Long";
251             case 415  : return "HTTP-415-ClientErr-Unsupported Media Type";
252             case 416  : return "HTTP-416-ClientErr-Requested Range Not Satisfiable";
253             case 417  : return "HTTP-417-ClientErr-Expectation Failed";
254             // HTTP Server Error 5xx
255             case 500  : return "HTTP-500-ServerErr-Internal Server Error";
256             case 501  : return "HTTP-501-ServerErr-Not Implemented";
257             case 502  : return "HTTP-502-ServerErr-Bad Gateway";
258             case 503  : return "HTTP-503-ServerErr-Service Unavailable";
259             case 504  : return "HTTP-504-ServerErr-Gateway Timeout";
260             case 505  : return "HTTP-505-ServerErr-HTTP Version Not Supported";
261             // Heritrix internal codes (all negative numbers
262             case S_BLOCKED_BY_USER:
263                 return "Heritrix(" + S_BLOCKED_BY_USER + ")-Blocked by user";
264             case S_BLOCKED_BY_CUSTOM_PROCESSOR:
265                 return "Heritrix(" + S_BLOCKED_BY_CUSTOM_PROCESSOR +
266                 ")-Blocked by custom prefetch processor";
267             case S_DELETED_BY_USER:
268                 return "Heritrix(" + S_DELETED_BY_USER + ")-Deleted by user";
269             case S_CONNECT_FAILED:
270                 return "Heritrix(" + S_CONNECT_FAILED + ")-Connection failed";
271             case S_CONNECT_LOST:
272                 return "Heritrix(" + S_CONNECT_LOST + ")-Connection lost";
273             case S_DEEMED_CHAFF:
274                 return "Heritrix(" + S_DEEMED_CHAFF + ")-Deemed chaff";
275             case S_DEFERRED:
276                 return "Heritrix(" + S_DEFERRED + ")-Deferred";
277             case S_DOMAIN_UNRESOLVABLE:
278                 return "Heritrix(" + S_DOMAIN_UNRESOLVABLE
279                         + ")-Domain unresolvable";
280             case S_OUT_OF_SCOPE:
281                 return "Heritrix(" + S_OUT_OF_SCOPE + ")-Out of scope";
282             case S_DOMAIN_PREREQUISITE_FAILURE:
283                 return "Heritrix(" + S_DOMAIN_PREREQUISITE_FAILURE
284                         + ")-Domain prerequisite failure";
285             case S_ROBOTS_PREREQUISITE_FAILURE:
286                 return "Heritrix(" + S_ROBOTS_PREREQUISITE_FAILURE
287                         + ")-Robots prerequisite failure";
288             case S_OTHER_PREREQUISITE_FAILURE:
289                 return "Heritrix(" + S_OTHER_PREREQUISITE_FAILURE
290                         + ")-Other prerequisite failure";
291             case S_PREREQUISITE_UNSCHEDULABLE_FAILURE:
292                 return "Heritrix(" + S_PREREQUISITE_UNSCHEDULABLE_FAILURE
293                         + ")-Prerequisite unschedulable failure";
294             case S_ROBOTS_PRECLUDED:
295                 return "Heritrix(" + S_ROBOTS_PRECLUDED + ")-Robots precluded";
296             case S_RUNTIME_EXCEPTION:
297                 return "Heritrix(" + S_RUNTIME_EXCEPTION
298                         + ")-Runtime exception";
299             case S_SERIOUS_ERROR:
300                 return "Heritrix(" + S_SERIOUS_ERROR + ")-Serious error";
301             case S_TIMEOUT:
302                 return "Heritrix(" + S_TIMEOUT + ")-Timeout";
303             case S_TOO_MANY_EMBED_HOPS:
304                 return "Heritrix(" + S_TOO_MANY_EMBED_HOPS
305                         + ")-Too many embed hops";
306             case S_TOO_MANY_LINK_HOPS:
307                 return "Heritrix(" + S_TOO_MANY_LINK_HOPS
308                         + ")-Too many link hops";
309             case S_TOO_MANY_RETRIES:
310                 return "Heritrix(" + S_TOO_MANY_RETRIES + ")-Too many retries";
311             case S_UNATTEMPTED:
312                 return "Heritrix(" + S_UNATTEMPTED + ")-Unattempted";
313             case S_UNFETCHABLE_URI:
314                 return "Heritrix(" + S_UNFETCHABLE_URI + ")-Unfetchable URI";
315             case S_PROCESSING_THREAD_KILLED:
316                 return "Heritrix(" + S_PROCESSING_THREAD_KILLED + ")-" +
317                     "Processing thread killed";
318             // Unknown return code
319             default : return Integer.toString(code);
320         }
321     }
322 
323 
324     /***
325      * Return the overall/fetch status of this CrawlURI for its
326      * current trip through the processing loop.
327      *
328      * @return a value from FetchStatusCodes
329      */
330     public int getFetchStatus(){
331         return fetchStatus;
332     }
333 
334     /***
335      * Set the overall/fetch status of this CrawlURI for
336      * its current trip through the processing loop.
337      *
338      * @param newstatus a value from FetchStatusCodes
339      */
340     public void setFetchStatus(int newstatus){
341         fetchStatus = newstatus;
342     }
343 
344     /***
345      * Get the number of attempts at getting the document referenced by this
346      * URI.
347      *
348      * @return the number of attempts at getting the document referenced by this
349      *         URI.
350      */
351     public int getFetchAttempts() {
352         return fetchAttempts;
353     }
354 
355     /***
356      * Increment the number of attempts at getting the document referenced by
357      * this URI.
358      *
359      * @return the number of attempts at getting the document referenced by this
360      *         URI.
361      */
362     public int incrementFetchAttempts() {
363         // TODO: rename, this is actually processing-loop-attempts
364         return fetchAttempts++;
365     }
366 
367     /***
368      * Reset fetchAttempts counter.
369      */
370     public void resetFetchAttempts() {
371         this.fetchAttempts = 0;
372     }
373 
374     /***
375      * Reset deferrals counter.
376      */
377     public void resetDeferrals() {
378         this.deferrals = 0;
379     }
380 
381     /***
382      * Get the next processor to process this URI.
383      *
384      * @return the processor that should process this URI next.
385      */
386     public Processor nextProcessor() {
387         return nextProcessor;
388     }
389 
390     /***
391      * Get the processor chain that should be processing this URI after the
392      * current chain is finished with it.
393      *
394      * @return the next processor chain to process this URI.
395      */
396     public ProcessorChain nextProcessorChain() {
397         return nextProcessorChain;
398     }
399 
400     /***
401      * Set the next processor to process this URI.
402      *
403      * @param processor the next processor to process this URI.
404      */
405     public void setNextProcessor(Processor processor) {
406         nextProcessor = processor;
407     }
408 
409     /***
410      * Set the next processor chain to process this URI.
411      *
412      * @param nextProcessorChain the next processor chain to process this URI.
413      */
414     public void setNextProcessorChain(ProcessorChain nextProcessorChain) {
415         this.nextProcessorChain = nextProcessorChain;
416     }
417 
418     /***
419      * Do all actions associated with setting a <code>CrawlURI</code> as
420      * requiring a prerequisite.
421      *
422      * @param lastProcessorChain Last processor chain reference.  This chain is
423      * where this <code>CrawlURI</code> goes next.
424      * @param preq Object to set a prerequisite.
425      * @throws URIException
426      */
427     public void markPrerequisite(String preq,
428             ProcessorChain lastProcessorChain) throws URIException {
429         Link link = createLink(preq,Link.PREREQ_MISC,Link.PREREQ_HOP);
430         setPrerequisiteUri(link);
431         incrementDeferrals();
432         setFetchStatus(S_DEFERRED);
433         skipToProcessorChain(lastProcessorChain);
434     }
435 
436     /***
437      * Set a prerequisite for this URI.
438      * <p>
439      * A prerequisite is a URI that must be crawled before this URI can be
440      * crawled.
441      *
442      * @param link Link to set as prereq.
443      */
444     public void setPrerequisiteUri(Object link) {
445         putObject(A_PREREQUISITE_URI, link);
446     }
447 
448     /***
449      * Get the prerequisite for this URI.
450      * <p>
451      * A prerequisite is a URI that must be crawled before this URI can be
452      * crawled.
453      *
454      * @return the prerequisite for this URI or null if no prerequisite.
455      */
456     public Object getPrerequisiteUri() {
457         return getObject(A_PREREQUISITE_URI);
458     }
459     
460     /***
461      * @return True if this CrawlURI has a prerequisite.
462      */
463     public boolean hasPrerequisiteUri() {
464         return containsKey(A_PREREQUISITE_URI);
465     }
466 
467     /***
468      * Returns true if this CrawlURI is a prerequisite.
469      *
470      * @return true if this CrawlURI is a prerequisite.
471      */
472     public boolean isPrerequisite() {
473         return this.prerequisite;
474     }
475 
476     /***
477      * Set if this CrawlURI is itself a prerequisite URI.
478      *
479      * @param prerequisite True if this CrawlURI is itself a prerequiste uri.
480      */
481     public void setPrerequisite(boolean prerequisite) {
482         this.prerequisite = prerequisite;
483     }
484 
485     /***
486      * @return This crawl URI as a string wrapped with 'CrawlURI(' +
487      * ')'.
488      */
489     public String getCrawlURIString() {
490         if (this.cachedCrawlURIString == null) {
491             synchronized (this) {
492                 if (this.cachedCrawlURIString == null) {
493                     this.cachedCrawlURIString =
494                         "CrawlURI(" + toString() + ")";
495                 }
496             }
497         }
498         return this.cachedCrawlURIString;
499     }
500 
501     /***
502      * Get the content type of this URI.
503      *
504      * @return Fetched URIs content type.  May be null.
505      */
506     public String getContentType() {
507         return this.contentType;
508     }
509 
510     /***
511      * Set a fetched uri's content type.
512      *
513      * @param ct Contenttype.  May be null.
514      */
515     public void setContentType(String ct) {
516         this.contentType = ct;
517     }
518 
519     /***
520      * Set the number of the ToeThread responsible for processing this uri.
521      *
522      * @param i the ToeThread number.
523      */
524     public void setThreadNumber(int i) {
525         threadNumber = i;
526     }
527 
528     /***
529      * Get the number of the ToeThread responsible for processing this uri.
530      *
531      * @return the ToeThread number.
532      */
533     public int getThreadNumber() {
534         return threadNumber;
535     }
536 
537     /***
538      * Increment the deferral count.
539      *
540      */
541     public void incrementDeferrals() {
542         deferrals++;
543     }
544 
545     /***
546      * Get the deferral count.
547      *
548      * @return the deferral count.
549      */
550     public int getDeferrals() {
551         return deferrals;
552     }
553 
554     /***
555      * Remove all attributes set on this uri.
556      * <p>
557      * This methods removes the attribute list.
558      */
559     public void stripToMinimal() {
560         clearAList();
561     }
562 
563     /*** 
564      * Get the size in bytes of this URI's recorded content, inclusive
565      * of things like protocol headers. It is the responsibility of the 
566      * classes which fetch the URI to set this value accordingly -- it is 
567      * not calculated/verified within CrawlURI. 
568      * 
569      * This value is consulted in reporting/logging/writing-decisions.
570      * 
571      * @see #setContentSize()
572      * @return contentSize
573      */
574     public long getContentSize(){
575         return contentSize;
576     }
577 
578     /***
579      * Make note of a non-fatal error, local to a particular Processor,
580      * which should be logged somewhere, but allows processing to continue.
581      *
582      * This is how you add to the local-error log (the 'localized' in
583      * the below is making an error local rather than global, not
584      * making a swiss-french version of the error.).
585      * 
586      * @param processorName Name of processor the exception was thrown
587      * in.
588      * @param ex Throwable to log.
589      * @param message Extra message to log beyond exception message.
590      */
591     public void addLocalizedError(final String processorName,
592             final Throwable ex, final String message) {
593         List<LocalizedError> localizedErrors;
594         if (containsKey(A_LOCALIZED_ERRORS)) {
595             @SuppressWarnings("unchecked")
596             List<LocalizedError> temp // to prevent warning on cast
597              = (List<LocalizedError>) getObject(A_LOCALIZED_ERRORS);
598             localizedErrors = temp;
599         } else {
600             localizedErrors = new ArrayList<LocalizedError>();
601             putObject(A_LOCALIZED_ERRORS, localizedErrors);
602         }
603 
604         localizedErrors.add(new LocalizedError(processorName, ex, message));
605         addAnnotation("le:" + getClassSimpleName(ex.getClass()) + "@" +
606             processorName);
607     }
608     
609     // TODO: Move to utils.
610     protected String getClassSimpleName(final Class c) {
611         String classname = c.getName();
612         int index = classname.lastIndexOf('.');
613         return ((index > 0 && (index + 1) < classname.length())?
614             classname.substring(index + 1): classname);
615     }
616 
617     /***
618      * Add an annotation: an abbrieviated indication of something special
619      * about this URI that need not be present in every crawl.log line,
620      * but should be noted for future reference. 
621      *
622      * @param annotation the annotation to add; should not contain 
623      * whitespace or a comma
624      */
625     public void addAnnotation(String annotation) {
626         String annotations;
627         if(containsKey(A_ANNOTATIONS)) {
628             annotations = getString(A_ANNOTATIONS);
629             annotations += ","+annotation;
630         } else {
631             annotations = annotation;
632         }
633 
634         putString(A_ANNOTATIONS,annotations);
635     }
636     
637     /***
638      * TODO: Implement truncation using booleans rather than as this
639      * ugly String parse.
640      * @return True if fetch was truncated.
641      */
642     public boolean isTruncatedFetch() {
643         return annotationContains(TRUNC_SUFFIX);
644     }
645     
646     public boolean isLengthTruncatedFetch() {
647         return annotationContains(LENGTH_TRUNC);
648     }
649     
650     public boolean isTimeTruncatedFetch() {
651         return annotationContains(TIMER_TRUNC);
652     }
653     
654     public boolean isHeaderTruncatedFetch() {
655         return annotationContains(HEADER_TRUNC);
656     }
657     
658     protected boolean annotationContains(final String str2Find) {
659         boolean result = false;
660         if (!containsKey(A_ANNOTATIONS)) {
661             return result;
662         }
663         String annotations = getString(A_ANNOTATIONS);
664         if (annotations != null && annotations.length() > 0) {
665             result = annotations.indexOf(str2Find) >= 0;
666         }
667         return result;
668     }
669 
670     /***
671      * Get the annotations set for this uri.
672      *
673      * @return the annotations set for this uri.
674      */
675     public String getAnnotations() {
676         return (containsKey(A_ANNOTATIONS))?
677             getString(A_ANNOTATIONS): null;
678     }
679 
680     /***
681      * Get the embeded hop count.
682      *
683      * @return the embeded hop count.
684      * @deprecated 
685      */
686     public int getEmbedHopCount() {
687         return embedHopCount;
688     }
689 
690     /***
691      * Get the link hop count.
692      *
693      * @return the link hop count.
694      * @deprecated 
695      */
696     public int getLinkHopCount() {
697         return linkHopCount;
698     }
699 
700     /***
701      * Mark this uri as being a seed.
702      *
703      *
704      * @deprecated 
705      */
706     public void markAsSeed() {
707         linkHopCount = 0;
708         embedHopCount = 0;
709     }
710 
711     /***
712      * Get the user agent to use for crawling this URI.
713      *
714      * If null the global setting should be used.
715      *
716      * @return user agent or null
717      */
718     public String getUserAgent() {
719         return userAgent;
720     }
721 
722     /***
723      * Set the user agent to use when crawling this URI.
724      *
725      * If not set the global settings should be used.
726      *
727      * @param string user agent to use
728      */
729     public void setUserAgent(String string) {
730         userAgent = string;
731     }
732 
733     /***
734      * Set which processor should be the next processor to process this uri
735      * instead of using the default next processor.
736      *
737      * @param processorChain the processor chain to skip to.
738      * @param processor the processor in the processor chain to skip to.
739      */
740     public void skipToProcessor(ProcessorChain processorChain,
741             Processor processor) {
742         setNextProcessorChain(processorChain);
743         setNextProcessor(processor);
744     }
745 
746     /***
747      * Set which processor chain should be processing this uri next.
748      *
749      * @param processorChain the processor chain to skip to.
750      */
751     public void skipToProcessorChain(ProcessorChain processorChain) {
752         setNextProcessorChain(processorChain);
753         setNextProcessor(null);
754     }
755 
756     /***
757      * For completed HTTP transactions, the length of the content-body.
758      *
759      * @return For completed HTTP transactions, the length of the content-body.
760      */
761     public long getContentLength() {
762         if (this.contentLength < 0) {
763             this.contentLength = (getHttpRecorder() != null)?
764                 getHttpRecorder().getResponseContentLength(): 0;
765         }
766         return this.contentLength;
767     }
768     
769     /***
770      * Get size of data recorded (transferred)
771      *
772      * @return recorded data size
773      */
774     public long getRecordedSize() {
775         return (getHttpRecorder() != null)
776                     ?  getHttpRecorder().getRecordedInput().getSize()
777                     // if unavailable fall back on content-size
778                     : getContentSize(); 
779     }
780 
781     /***
782      * Sets the 'content size' for the URI, which is considered inclusive
783      * of all recorded material (such as protocol headers) or even material
784      * 'virtually' considered (as in material from a previous fetch 
785      * confirmed unchanged with a server). (In contrast, content-length 
786      * matches the HTTP definition, that of the enclosed content-body.)
787      * 
788      * Should be set by a fetcher or other processor as soon as the final 
789      * size of recorded content is known. Setting to an artificial/incorrect
790      * value may affect other reporting/processing. 
791      * 
792      * @param l Content size.
793      */
794     public void setContentSize(long l) {
795         contentSize = l;
796     }
797 
798     /***
799      * If true then a link extractor has already claimed this CrawlURI and
800      * performed link extraction on the document content. This does not
801      * preclude other link extractors that may have an interest in this
802      * CrawlURI from also doing link extraction but default behavior should
803      * be to not run if link extraction has already been done.
804      * 
805      * <p>There is an onus on link extractors to set this flag if they have
806      * run.
807      * 
808      * <p>The only extractor of the default Heritrix set that does not
809      * respect this flag is
810      * {@link org.archive.crawler.extractor.ExtractorHTTP}.
811      * It runs against HTTP headers, not the document content.
812      * 
813      * @return True if a processor has performed link extraction on this
814      * CrawlURI
815      *
816      * @see #linkExtractorFinished()
817      */
818     public boolean hasBeenLinkExtracted(){
819         return linkExtractorFinished;
820     }
821 
822     /***
823      * Note that link extraction has been performed on this CrawlURI. A processor
824      * doing link extraction should invoke this method once it has finished it's
825      * work. It should invoke it even if no links are extracted. It should only
826      * invoke this method if the link extraction was performed on the document
827      * body (not the HTTP headers etc.).
828      *
829      * @see #hasBeenLinkExtracted()
830      */
831     public void linkExtractorFinished() {
832         linkExtractorFinished = true;
833         if(discardedOutlinks>0) {
834             addAnnotation("dol:"+discardedOutlinks);
835         }
836     }
837 
838     /***
839      * Notify CrawlURI it is about to be logged; opportunity
840      * for self-annotation
841      */
842     public void aboutToLog() {
843         if (fetchAttempts>1) {
844             addAnnotation(fetchAttempts+"t");
845         }
846     }
847 
848     /***
849      * Get the http recorder associated with this uri.
850      *
851      * @return Returns the httpRecorder.  May be null but its set early in
852      * FetchHttp so there is an issue if its null.
853      */
854     public HttpRecorder getHttpRecorder() {
855         return httpRecorder;
856     }
857 
    /***
     * Set the http recorder to be associated with this uri.
     * Cleared again (nulled) by {@link #processingCleanup()}.
     *
     * @param httpRecorder The httpRecorder to set.
     */
    public void setHttpRecorder(HttpRecorder httpRecorder) {
        this.httpRecorder = httpRecorder;
    }
866 
    /***
     * Return true if this is a http transaction.
     *
     * Implemented as a presence check for the A_HTTP_TRANSACTION key on
     * this URI's AList.
     *
     * TODO: Compound this and {@link #isPost()} method so that there is one
     * place to go to find out if get http, post http, ftp, dns.
     *
     * @return True if this is a http transaction.
     */
    public boolean isHttpTransaction() {
        return containsKey(A_HTTP_TRANSACTION);
    }
878 
    /***
     * Clean up after a run through the processing chain.
     *
     * Called on the end of processing chain by Frontier#finish.  Null out any
     * state gathered during processing so it does not linger on requeue.
     */
    public void processingCleanup() {
        // Drop the (potentially heavyweight) recorder reference.
        this.httpRecorder = null;
        // Reset per-attempt fetch state back to pristine values.
        this.fetchStatus = S_UNATTEMPTED;
        this.setPrerequisite(false);
        this.contentSize = UNCALCULATED;
        this.contentLength = UNCALCULATED;
        // Clear 'links extracted' flag.
        this.linkExtractorFinished = false;
        // Clean the alist of all but registered permanent members.
        setAList(getPersistentAList());
    }
896     
897     public AList getPersistentAList() {
898         AList newAList = new HashtableAList();
899         // copy declared persistent keys
900         if(alistPersistentMember!=null && alistPersistentMember.size() > 0) {
901             newAList.copyKeysFrom(alistPersistentMember.iterator(), getAList());
902         } 
903         // also copy declared 'heritable' keys
904         List heritableKeys = (List) getObject(A_HERITABLE_KEYS);
905         if(heritableKeys!=null) {
906             newAList.copyKeysFrom(heritableKeys.iterator(), getAList());
907         }
908         return newAList;
909     }
910 
911     /***
912      * Make a <code>CrawlURI</code> from the passed <code>CandidateURI</code>.
913      *
914      * Its safe to pass a CrawlURI instance.  In this case we just return it
915      * as a result. Otherwise, we create new CrawlURI instance.
916      *
917      * @param caUri Candidate URI.
918      * @param ordinal
919      * @return A crawlURI made from the passed CandidateURI.
920      */
921     public static CrawlURI from(CandidateURI caUri, long ordinal) {
922         return (caUri instanceof CrawlURI)?
923             (CrawlURI)caUri: new CrawlURI(caUri, ordinal);
924     }
925 
    /***
     * Store the set of credential avatars on this URI's AList.
     *
     * @param avatars Credential avatars to save off.
     */
    private void setCredentialAvatars(Set avatars) {
        putObject(A_CREDENTIAL_AVATARS_KEY, avatars);
    }
932 
    /***
     * @return Credential avatars stored on this URI's AList.
     * Null if none set.
     */
    @SuppressWarnings("unchecked")
    public Set<CredentialAvatar> getCredentialAvatars() {
        // Unchecked cast: the AList stores an untyped Object.
        return (Set)getObject(A_CREDENTIAL_AVATARS_KEY);
    }
940 
941     /***
942      * @return True if there are avatars attached to this instance.
943      */
944     public boolean hasCredentialAvatars() {
945         return getCredentialAvatars() != null &&
946             getCredentialAvatars().size() > 0;
947     }
948 
949     /***
950      * Add an avatar.
951      *
952      * We do lazy instantiation.
953      *
954      * @param ca Credential avatar to add to set of avatars.
955      */
956     public void addCredentialAvatar(CredentialAvatar ca) {
957         Set<CredentialAvatar> avatars = getCredentialAvatars();
958         if (avatars == null) {
959             avatars = new HashSet<CredentialAvatar>();
960             setCredentialAvatars(avatars);
961         }
962         avatars.add(ca);
963     }
964 
965     /***
966      * Remove all credential avatars from this crawl uri.
967      */
968     public void removeCredentialAvatars() {
969         if (hasCredentialAvatars()) {
970             remove(A_CREDENTIAL_AVATARS_KEY);
971         }
972     }
973 
974     /***
975      * Remove all credential avatars from this crawl uri.
976      * @param ca Avatar to remove.
977      * @return True if we removed passed parameter.  False if no operation
978      * performed.
979      */
980     public boolean removeCredentialAvatar(CredentialAvatar ca) {
981         boolean result = false;
982         Set avatars = getCredentialAvatars();
983         if (avatars != null && avatars.size() > 0) {
984             result = avatars.remove(ca);
985         }
986         return result;
987     }
988 
989     /***
990      * Ask this URI if it was a success or not.
991      *
992      * Only makes sense to call this method after execution of
993      * HttpMethod#execute. Regard any status larger then 0 as success
994      * except for below caveat regarding 401s.  Use {@link #is2XXSuccess()} if
995      * looking for a status code in the 200 range.
996      *
997      * <p>401s caveat: If any rfc2617 credential data present and we got a 401
998      * assume it got loaded in FetchHTTP on expectation that we're to go around
999      * the processing chain again. Report this condition as a failure so we
1000      * get another crack at the processing chain only this time we'll be making
1001      * use of the loaded credential data.
1002      *
1003      * @return True if ths URI has been successfully processed.
1004      * @see #is2XXSuccess()
1005      */
1006     public boolean isSuccess() {
1007         boolean result = false;
1008         int statusCode = this.fetchStatus;
1009         if (statusCode == HttpStatus.SC_UNAUTHORIZED &&
1010             hasRfc2617CredentialAvatar()) {
1011             result = false;
1012         } else {
1013             result = (statusCode > 0);
1014         }
1015         return result;
1016     }
1017     
1018     /***
1019      * @return True if status code is in the 2xx range.
1020      * @see #isSuccess()
1021      */
1022     public boolean is2XXSuccess() {
1023     	return this.fetchStatus >= 200 && this.fetchStatus < 300;
1024     }
1025 
1026     /***
1027 	 * @return True if we have an rfc2617 payload.
1028 	 */
1029 	public boolean hasRfc2617CredentialAvatar() {
1030 	    boolean result = false;
1031 	    Set avatars = getCredentialAvatars();
1032 	    if (avatars != null && avatars.size() > 0) {
1033 	        for (Iterator i = avatars.iterator(); i.hasNext();) {
1034 	            if (((CredentialAvatar)i.next()).
1035 	                match(Rfc2617Credential.class)) {
1036 	                result = true;
1037 	                break;
1038 	            }
1039 	        }
1040 	    }
1041         return result;
1042 	}
1043 
    /***
     * Set whether this URI should be fetched by sending a HTTP POST request.
     * Else a HTTP GET request will be used.
     *
     * @param b True if this curi is to be POST'd.  Else its to be GET'd.
     */
    public void setPost(boolean b) {
        this.post = b;
    }
1053 
    /***
     * Returns true if this URI should be fetched by sending a HTTP POST
     * request, as set via {@link #setPost(boolean)}.
     *
     * TODO: Compound this and {@link #isHttpTransaction()} method so that there
     * is one place to go to find out if get http, post http, ftp, dns.
     *
     * @return True if this CrawlURI instance is to be posted.
     */
    public boolean isPost() {
        return this.post;
    }
1066 
    /***
     * Set the retained content-digest value (usu. SHA1).
     * Delegates to the two-argument form with a hard-coded "SHA1" scheme.
     *
     * @param digestValue Raw digest bytes.
     * @deprecated Use {@link #setContentDigest(String scheme, byte[])}
     */
    public void setContentDigest(byte[] digestValue) {
        setContentDigest("SHA1", digestValue);
    }
1076     
    /***
     * Set the retained content-digest value together with the scheme
     * (e.g. "SHA1") that was used to compute it.
     *
     * @param scheme Name of the digest scheme.
     * @param digestValue Raw digest bytes.
     */
    public void setContentDigest(final String scheme,
            final byte [] digestValue) {
        this.contentDigest = digestValue;
        this.contentDigestScheme = scheme;
    }
1082     
1083     public String getContentDigestSchemeString() {
1084         if(this.contentDigest==null) {
1085             return null;
1086         }
1087         return this.contentDigestScheme + ":" + getContentDigestString();
1088     }
1089 
    /***
     * Return the retained content-digest value, if any.
     *
     * @return Digest value (a byte[] as stored by
     * {@link #setContentDigest(String, byte[])}), or null if unset.
     * Declared Object for historical interface reasons.
     */
    public Object getContentDigest() {
        return contentDigest;
    }
1098     
1099     public String getContentDigestString() {
1100         if(this.contentDigest==null) {
1101             return null;
1102         }
1103         return Base32.encode(this.contentDigest);
1104     }
1105 
    // Opaque slots an enclosing/queueing facility (e.g. the Frontier) uses
    // to associate this CrawlURI with its container and key.  Transient:
    // not retained across serialization.
    transient Object holder;
    transient Object holderKey;
1108 
1109     /***
1110      * Remember a 'holder' to which some enclosing/queueing
1111      * facility has assigned this CrawlURI
1112      * .
1113      * @param obj
1114      */
1115     public void setHolder(Object obj) {
1116         holder=obj;
1117     }
1118 
1119     /***
1120      * Return the 'holder' for the convenience of 
1121      * an external facility.
1122      *
1123      * @return holder
1124      */
1125     public Object getHolder() {
1126         return holder;
1127     }
1128 
1129     /***
1130      * Remember a 'holderKey' which some enclosing/queueing
1131      * facility has assigned this CrawlURI
1132      * .
1133      * @param obj
1134      */
1135     public void setHolderKey(Object obj) {
1136         holderKey=obj;
1137     }
1138     /***
1139      * Return the 'holderKey' for convenience of 
1140      * an external facility (Frontier).
1141      * 
1142      * @return holderKey 
1143      */
1144     public Object getHolderKey() {
1145         return holderKey;
1146     }
1147 
1148     /***
1149      * Get the ordinal (serial number) assigned at creation.
1150      * 
1151      * @return ordinal
1152      */
1153     public long getOrdinal() {
1154         return ordinal;
1155     }
1156 
    /*** spot for an integer cost to be placed by external facility (frontier).
     *  cost is truncated to 8 bits at times, so should not exceed 255.
     *  Starts at UNCALCULATED until the frontier assigns a value. */
    int holderCost = UNCALCULATED;
1160     /***
1161      * Return the 'holderCost' for convenience of external facility (frontier)
1162      * @return value of holderCost
1163      */
1164     public int getHolderCost() {
1165         return holderCost;
1166     }
1167 
1168     /***
1169      * Remember a 'holderCost' which some enclosing/queueing
1170      * facility has assigned this CrawlURI
1171      * @param cost value to remember
1172      */
1173     public void setHolderCost(int cost) {
1174         holderCost = cost;
1175     }
1176 
    /***
     * All discovered outbound Links (navlinks, embeds, etc.)
     * Can either contain Link instances or CandidateURI instances, or both.
     * The LinksScoper processor converts Link instances in this collection
     * to CandidateURI instances.
     * Transient because it is custom-serialized by writeObject/readObject
     * (an empty set is written as null to save space).
     */
    transient Collection<Object> outLinks = new HashSet<Object>();
1184     
1185     /***
1186      * Returns discovered links.  The returned collection might be empty if
1187      * no links were discovered, or if something like LinksScoper promoted
1188      * the links to CandidateURIs.
1189      * 
1190      * Elements can be removed from the returned collection, but not added.
1191      * To add a discovered link, use one of the createAndAdd methods or
1192      * {@link #getOutObjects()}.
1193      * 
1194      * @return Collection of all discovered outbound Links
1195      */
1196     public Collection<Link> getOutLinks() {
1197         return Transform.subclasses(outLinks, Link.class);
1198     }
1199     
1200     /***
1201      * Returns discovered candidate URIs.  The returned collection will be
1202      * emtpy until something like LinksScoper promotes discovered Links
1203      * into CandidateURIs.
1204      * 
1205      * Elements can be removed from the returned collection, but not added.
1206      * To add a candidate URI, use {@link #replaceOutlinks(Collection)} or
1207      * {@link #getOutObjects}.
1208      * 
1209      * @return  Collection of candidate URIs
1210      */
1211     public Collection<CandidateURI> getOutCandidates() {
1212         return Transform.subclasses(outLinks, CandidateURI.class);
1213     }
1214     
1215     
1216     /***
1217      * Returns all of the outbound objects.  The returned Collection will
1218      * contain Link instances, or CandidateURI instances, or both.  
1219      * 
1220      * @return  the collection of Links and/or CandidateURIs
1221      */
1222     public Collection<Object> getOutObjects() {
1223         return outLinks;
1224     }
1225     
1226     /***
1227      * Add a discovered Link, unless it would exceed the max number
1228      * to accept. (If so, increment discarded link counter.) 
1229      * 
1230      * @param link the Link to add
1231      */
1232     public void addOutLink(Link link) {
1233         if (outLinks.size() < MAX_OUTLINKS) {
1234             outLinks.add(link);
1235         } else {
1236             // note & discard
1237             discardedOutlinks++;
1238         }
1239     }
1240     
    /***
     * Discard all discovered outbound Links/CandidateURIs.
     */
    public void clearOutlinks() {
        this.outLinks.clear();
    }
1244     
    /***
     * Replace current collection of links w/ passed list.
     * Used by Scopers adjusting the list of links (removing those
     * not in scope and promoting Links to CandidateURIs).
     *
     * @param links collection of CandidateURIs replacing any previously
     *   existing outLinks or outCandidates
     */
    public void replaceOutlinks(Collection<CandidateURI> links) {
        clearOutlinks();
        this.outLinks.addAll(links);
    }
1257     
1258     
    /***
     * @return Count of outbound objects currently held (Links and/or
     * CandidateURIs).
     */
    public int outlinksSize() {
        return this.outLinks.size();
    }
1265 
1266     /***
1267      * Convenience method for creating a Link discovered at this URI
1268      * with the given string and context
1269      * 
1270      * @param url
1271      *            String to use to create Link
1272      * @param context
1273      *            CharSequence context to use
1274      * @param hopType
1275      * @return Link.
1276      * @throws URIException
1277      *             if Link UURI cannot be constructed
1278      */
1279     public Link createLink(String url, CharSequence context,
1280             char hopType) throws URIException {
1281         return new Link(getUURI(), UURIFactory.getInstance(getUURI(),
1282                 url), context, hopType);
1283     }
1284     
1285     /***
1286      * Convenience method for creating a Link with the given string and
1287      * context
1288      * 
1289      * @param url
1290      *            String to use to create Link
1291      * @param context
1292      *            CharSequence context to use
1293      * @param hopType
1294      * @throws URIException
1295      *             if Link UURI cannot be constructed
1296      */
1297     public void createAndAddLink(String url, CharSequence context,
1298             char hopType) throws URIException {
1299         addOutLink(createLink(url, context, hopType));
1300     }
1301 
1302     /***
1303      * Convenience method for creating a Link with the given string and
1304      * context, relative to a previously set base HREF if available (or
1305      * relative to the current CrawlURI if no other base has been set)
1306      * 
1307      * @param url String URL to add as destination of link
1308      * @param context String context where link was discovered
1309      * @param hopType char hop-type indicator
1310      * @throws URIException
1311      */
1312     public void createAndAddLinkRelativeToBase(String url,
1313             CharSequence context, char hopType) throws URIException {
1314         addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1315                 getBaseURI(), url), context, hopType));
1316     }
1317     
1318     /***
1319      * Convenience method for creating a Link with the given string and
1320      * context, relative to this CrawlURI's via UURI if available. (If
1321      * a via is not available, falls back to using 
1322      * #createAndAddLinkRelativeToBase.)
1323      * 
1324      * @param url String URL to add as destination of link
1325      * @param context String context where link was discovered
1326      * @param hopType char hop-type indicator
1327      * @throws URIException
1328      */
1329     public void createAndAddLinkRelativeToVia(String url,
1330             CharSequence context, char hopType) throws URIException {
1331         if(getVia()!=null) {
1332             addOutLink(new Link(getUURI(), UURIFactory.getInstance(
1333                 getVia(), url), context, hopType));
1334         } else {
1335             // if no 'via', fall back to base/self
1336             createAndAddLinkRelativeToBase(url,context,hopType);
1337         }
1338     }
1339     
    /***
     * Set the (HTML) Base URI used for derelativizing internal URIs.
     * Stored on the AList under A_HTML_BASE.
     *
     * @param baseHref String base href to use
     * @throws URIException if supplied string cannot be interpreted as URI
     */
    public void setBaseURI(String baseHref) throws URIException {
        putObject(A_HTML_BASE, UURIFactory.getInstance(baseHref));
    }
1349       
1350     /***
1351      * Get the (HTML) Base URI used for derelativizing internal URIs. 
1352      *
1353      * @return UURI base URI previously set 
1354      */  
1355     public UURI getBaseURI() {
1356         if (!containsKey(A_HTML_BASE)) {
1357             return getUURI();
1358         }
1359         return (UURI)getObject(A_HTML_BASE);
1360     }
1361     
    /***
     * Add the key of alist items you want to persist across
     * processings.  Note the registry is static: it applies to all
     * CrawlURI instances.
     *
     * @param key Key to add.
     */
    public static void addAlistPersistentMember(Object key) {
        alistPersistentMember.add(key);
    }
1370     
    /***
     * Remove a key from the static registry of alist keys persisted
     * across processings.
     *
     * @param key Key to remove.
     * @return True if list contained the element.
     */
    public static boolean removeAlistPersistentMember(Object key) {
        return alistPersistentMember.remove(key);
    }
1378 
    /***
     * Custom serialization writing an empty 'outLinks' as null. Estimated
     * to save ~20 bytes in serialized form.
     *
     * @param stream Stream being written to.
     * @throws IOException
     */
    private void writeObject(ObjectOutputStream stream) throws IOException {
        stream.defaultWriteObject();
        // outLinks is transient, so it is appended manually; an empty set
        // is written as null and rebuilt by readObject on the way back in.
        stream.writeObject((outLinks.isEmpty()) ? null : outLinks);
    }
1390 
    /***
     * Custom deserialization recreating empty HashSet from null in 'outLinks'
     * slot.  Counterpart of the null-for-empty written by writeObject.
     *
     * @param stream Stream being read from.
     * @throws IOException
     * @throws ClassNotFoundException
     */
    private void readObject(ObjectInputStream stream) throws IOException,
            ClassNotFoundException {
        stream.defaultReadObject();
        @SuppressWarnings("unchecked")
        HashSet<Object> ol = (HashSet<Object>) stream.readObject();
        outLinks = (ol == null) ? new HashSet<Object>() : ol;
    }
1406 
1407     public long getFetchDuration() {
1408         if(! containsKey(A_FETCH_COMPLETED_TIME)) {
1409             return -1;
1410         }
1411         
1412         long completedTime = getLong(A_FETCH_COMPLETED_TIME);
1413         long beganTime = getLong(A_FETCH_BEGAN_TIME);
1414         return completedTime - beganTime;
1415     }
1416 
1417 
1418 }