1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.deciderules;
26
27 import org.archive.crawler.datamodel.CandidateURI;
28 import org.archive.crawler.extractor.Link;
29 import org.archive.crawler.settings.SimpleType;
30 import org.archive.crawler.settings.Type;
31
32
33
34 /***
35 * Rule ACCEPTs any CrawlURIs whose path-from-seed ('hopsPath' -- see
36 * {@link CandidateURI#getPathFromSeed()}) ends
37 * with at least one, but not more than, the given number of
38 * non-navlink ('L') hops.
39 *
40 * Otherwise, if the path-from-seed is empty or if a navlink ('L') occurs
41 * within max-trans-hops of the tail of the path-from-seed, this rule
42 * returns PASS.
43 *
44 * <p>Thus, it allows things like embedded resources (frames/images/media)
45 * and redirects to be transitively included ('transcluded') in a crawl,
46 * even if they otherwise would not, for some reasonable number of hops
47 * (1-4).
48 *
49 * @see <a href="http://www.google.com/search?q=define%3Atransclusion&sourceid=mozilla&start=0&start=0&ie=utf-8&oe=utf-8">Transclusion</a>
50 *
51 * @author gojomo
52 */
53 public class TransclusionDecideRule extends PredicatedDecideRule {
54
55 private static final long serialVersionUID = -3975688876990558918L;
56
57 private static final String ATTR_MAX_TRANS_HOPS = "max-trans-hops";
58
59 private static final String ATTR_MAX_SPECULATIVE_HOPS = "max-speculative-hops";
60
61 /***
62 * Default maximum transitive hops -- any type
63 * Default access so can be accessed by unit tests.
64 */
65 static final Integer DEFAULT_MAX_TRANS_HOPS = new Integer(3);
66
67 /***
68 * Default maximum speculative ('X') hops.
69 * Default access so can be accessed by unit tests.
70 */
71 static final Integer DEFAULT_MAX_SPECULATIVE_HOPS = new Integer(1);
72
73 /***
74 * Usual constructor.
75 * @param name Name of this DecideRule.
76 */
77 public TransclusionDecideRule(String name) {
78 super(name);
79 setDescription("TransclusionDecideRule. ACCEPTs URIs whose path " +
80 "from the seed ends with up to (but not more than) the " +
81 "configured '" + ATTR_MAX_TRANS_HOPS +
82 "' number of non-navlink ('L') hops.");
83
84 Type type = getElementFromDefinition(ATTR_DECISION);
85 type.setTransient(true);
86 addElementToDefinition(new SimpleType(ATTR_MAX_TRANS_HOPS,
87 "Maximum number of non-navlink (non-'L') hops to ACCEPT.",
88 DEFAULT_MAX_TRANS_HOPS));
89 addElementToDefinition(new SimpleType(ATTR_MAX_SPECULATIVE_HOPS,
90 "Maximum number of speculative ('X') hops to ACCEPT.",
91 DEFAULT_MAX_SPECULATIVE_HOPS));
92 }
93
94 /***
95 * Evaluate whether given object is within the threshold number of
96 * transitive hops.
97 *
98 * @param object Object to make decision on.
99 * @return true if the transitive hops >0 and <= max
100 */
101 protected boolean evaluate(Object object) {
102 CandidateURI curi = null;
103 try {
104 curi = (CandidateURI)object;
105 } catch (ClassCastException e) {
106
107 return false;
108 }
109 String hopsPath = curi.getPathFromSeed();
110 if (hopsPath == null || hopsPath.length() == 0) {
111 return false;
112 }
113 int count = 0;
114 int specCount = 0;
115 for (int i = hopsPath.length() - 1; i >= 0; i--) {
116 char c = hopsPath.charAt(i);
117 if (c != Link.NAVLINK_HOP) {
118 count++;
119 if(c == Link.SPECULATIVE_HOP) {
120 specCount++;
121 }
122 } else {
123 break;
124 }
125 }
126 return count > 0 && (specCount <= getThresholdSpeculativeHops(object) && count <= getThresholdHops(object));
127 }
128
129 /***
130 * @param obj Context object.
131 * @return hops cutoff threshold
132 */
133 private int getThresholdHops(Object obj) {
134 return ((Integer)getUncheckedAttribute(obj,ATTR_MAX_TRANS_HOPS)).
135 intValue();
136 }
137
138 /***
139 * @param obj Context object.
140 * @return hops cutoff threshold
141 */
142 private int getThresholdSpeculativeHops(Object obj) {
143 return ((Integer)getUncheckedAttribute(obj,ATTR_MAX_SPECULATIVE_HOPS)).
144 intValue();
145 }
146 }