1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.crawler.extractor;
28
29 import java.io.IOException;
30 import java.io.InputStream;
31 import java.util.Vector;
32 import java.util.logging.Logger;
33 import java.util.regex.Matcher;
34
35 import org.archive.crawler.datamodel.CoreAttributeConstants;
36 import org.archive.crawler.datamodel.CrawlURI;
37 import org.archive.crawler.framework.CrawlController;
38 import org.archive.util.TextUtils;
39
40 import com.anotherbigidea.flash.interfaces.SWFActions;
41 import com.anotherbigidea.flash.interfaces.SWFTagTypes;
42 import com.anotherbigidea.flash.readers.SWFReader;
43 import com.anotherbigidea.flash.readers.TagParser;
44 import com.anotherbigidea.flash.structs.AlphaTransform;
45 import com.anotherbigidea.flash.structs.Matrix;
46 import com.anotherbigidea.flash.writers.SWFActionsImpl;
47 import com.anotherbigidea.flash.writers.SWFTagTypesImpl;
48 import com.anotherbigidea.io.InStream;
49
50 /***
51 * Process SWF (flash/shockwave) files for strings that are likely to be
52 * crawlable URIs.
53 *
54 * @author Igor Ranitovic
55 */
56 public class ExtractorSWF extends Extractor implements CoreAttributeConstants {
57
58 private static final long serialVersionUID = 3627359592408010589L;
59
60 private static Logger logger = Logger.getLogger(ExtractorSWF.class
61 .getName());
62
63 protected long numberOfCURIsHandled = 0;
64
65 protected long numberOfLinksExtracted = 0;
66
67
68
69 private static final int MAX_READ_SIZE = 1024 * 1024;
70
71 /***
72 * @param name
73 */
74 public ExtractorSWF(String name) {
75 super(name, "Flash extractor. Extracts URIs from SWF "
76 + "(flash/shockwave) files.");
77 }
78
79 protected void extract(CrawlURI curi) {
80 if (!isHttpTransactionContentToProcess(curi)) {
81 return;
82 }
83
84 String contentType = curi.getContentType();
85 if (contentType == null) {
86 return;
87 }
88 if ((contentType.toLowerCase().indexOf("x-shockwave-flash") < 0)
89 && (!curi.toString().toLowerCase().endsWith(".swf"))) {
90 return;
91 }
92
93
94 SWFReader reader = getSWFReader(curi);
95 if (reader == null) {
96 return;
97 }
98
99 numberOfCURIsHandled++;
100 try {
101
102 reader.readFile();
103 } catch (IOException e) {
104 curi.addLocalizedError(getName(), e, "failed reading");
105 } catch (NullPointerException e) {
106 curi.addLocalizedError(getName(), e, "bad .swf file");
107 } catch (NegativeArraySizeException e) {
108 curi.addLocalizedError(getName(), e, "bad .swf file");
109 }
110
111
112 curi.linkExtractorFinished();
113 logger.fine(curi + " has " + numberOfLinksExtracted + " links.");
114
115 }
116
117 /***
118 * Get a link extracting SWFParser.
119 *
120 * A custom SWFReader which parses links from .swf file.
121 *
122 * @param curi A CrawlURI to be processed.
123 * @return An SWFReader.
124 */
125 private SWFReader getSWFReader(CrawlURI curi) {
126
127 InputStream documentStream = null;
128
129 try {
130 documentStream = curi.getHttpRecorder().getRecordedInput()
131 .getContentReplayInputStream();
132 if (documentStream == null) {
133 return null;
134 }
135 } catch (IOException e) {
136 curi.addLocalizedError(getName(), e, "Fail reading.");
137 } finally {
138 try {
139 documentStream.close();
140 } catch (IOException e) {
141 curi.addLocalizedError(getName(), e, "Fail on close.");
142 }
143 }
144
145
146
147 ExtractorSWFActions actions = new ExtractorSWFActions(curi,
148 getController());
149
150 ExtractorSWFTags tags = new ExtractorSWFTags(actions);
151
152 SWFReader reader = new SWFReader(getTagParser(tags), documentStream) {
153 /***
154 * Override because a corrupt SWF file can cause us to try read
155 * lengths that are hundreds of megabytes in size causing us to
156 * OOME.
157 *
158 * Below is copied from SWFReader parent class.
159 */
160 public int readOneTag() throws IOException {
161 int header = mIn.readUI16();
162 int type = header >> 6;
163 int length = header & 0x3F;
164 boolean longTag = (length == 0x3F);
165 if (longTag) {
166 length = (int) mIn.readUI32();
167 }
168
169 if (length > MAX_READ_SIZE) {
170
171
172 mIn.skipBytes(length);
173 logger.info("oversized SWF tag (type=" + type + ";length="
174 + length + ") skipped");
175 } else {
176 byte[] contents = mIn.read(length);
177 mConsumer.tag(type, longTag, contents);
178 }
179 return type;
180 }
181 };
182 return reader;
183 }
184
185 /***
186 * Get a TagParser
187 *
188 * A custom ExtractorTagParser which ignores all the big binary image/
189 * sound/font types which don't carry URLs is used, to avoid the
190 * occasionally fatal (OutOfMemoryError) memory bloat caused by the
191 * all-in-memory SWF library handling.
192 *
193 * @param customTags
194 * A custom tag parser.
195 * @return An SWFReader.
196 */
197 private TagParser getTagParser(SWFTagTypes customTags) {
198 return new ExtractorTagParser(customTags);
199 }
200
201 /***
202 * TagParser customized to ignore SWFTags that will never contain
203 * extractable URIs.
204 */
205 protected class ExtractorTagParser extends TagParser {
206
207 protected ExtractorTagParser(SWFTagTypes tagtypes) {
208 super(tagtypes);
209 }
210
211 protected void parseDefineBits(InStream in) throws IOException {
212
213 }
214
215 protected void parseDefineBitsJPEG3(InStream in) throws IOException {
216
217 }
218
219 protected void parseDefineBitsLossless(InStream in, int length,
220 boolean hasAlpha) throws IOException {
221
222 }
223
224 protected void parseDefineButtonSound(InStream in) throws IOException {
225
226 }
227
228 protected void parseDefineFont(InStream in) throws IOException {
229
230 }
231
232 protected void parseDefineJPEG2(InStream in, int length)
233 throws IOException {
234
235 }
236
237 protected void parseDefineJPEGTables(InStream in) throws IOException {
238
239 }
240
241 protected void parseDefineShape(int type, InStream in)
242 throws IOException {
243
244 }
245
246 protected void parseDefineSound(InStream in) throws IOException {
247
248 }
249
250 protected void parseFontInfo(InStream in, int length, boolean isFI2)
251 throws IOException {
252
253 }
254
255 protected void parseDefineFont2(InStream in) throws IOException {
256
257 }
258 }
259
260 /***
261 * SWFTagTypes customized to use <code>ExtractorSWFActions</code>, which
262 * parse URI-like strings.
263 */
264 protected class ExtractorSWFTags extends SWFTagTypesImpl {
265
266 private SWFActions actions;
267
268 public ExtractorSWFTags(SWFActions acts) {
269 super(null);
270 actions = acts;
271 }
272
273 public SWFActions tagDefineButton(int id, Vector buttonRecords)
274 throws IOException {
275
276 return actions;
277 }
278
279 public SWFActions tagDefineButton2(int id, boolean trackAsMenu,
280 Vector buttonRecord2s) throws IOException {
281
282 return actions;
283 }
284
285 public SWFActions tagDoAction() throws IOException {
286 return actions;
287 }
288
289 public SWFActions tagDoInActions(int spriteId) throws IOException {
290 return actions;
291 }
292
293 public SWFTagTypes tagDefineSprite(int id) throws IOException {
294 return this;
295 }
296
297 public SWFActions tagPlaceObject2(boolean isMove, int clipDepth,
298 int depth, int charId, Matrix matrix, AlphaTransform cxform,
299 int ratio, String name, int clipActionFlags) throws IOException {
300
301 return actions;
302 }
303
304 }
305
306 /***
307 * SWFActions that parse URI-like strings. Links discovered using
308 * <code>ExtractorJS</code> are marked as speculative links (hop X). All
309 * other links are marked as embedded links (hop E).
310 *
311 */
312 protected class ExtractorSWFActions extends SWFActionsImpl {
313
314 private CrawlURI curi;
315
316 private CrawlController controller;
317
318 static final String JSSTRING = "javascript:";
319
320 /***
321 * @param curi
322 * SWF URL to handle
323 * @param controller
324 * Crawl controller need for error reporting
325 */
326 public ExtractorSWFActions(CrawlURI curi, CrawlController controller) {
327 assert (curi != null) : "CrawlURI should not be null";
328 this.curi = curi;
329 this.controller = controller;
330 }
331
332 /***
333 * Overwrite handling of discovered URIs.
334 *
335 * @param url
336 * Discovered URL.
337 * @param target
338 * Discovered target (currently not being used.)
339 * @throws IOException
340 */
341 public void getURL(String url, String target) throws IOException {
342
343
344
345
346
347 processURIString(url);
348 }
349
350 public void lookupTable(String[] strings) throws IOException {
351 for (String str : strings) {
352 considerStringAsUri(str);
353 }
354 }
355
356 public void push(String value) throws IOException {
357 considerStringAsUri(value);
358 }
359
360 public void considerStringAsUri(String str) throws IOException {
361 Matcher uri = TextUtils.getMatcher(ExtractorJS.STRING_URI_DETECTOR,
362 str);
363
364 if (uri.matches()) {
365 curi.createAndAddLinkRelativeToVia(uri.group(),
366 Link.SPECULATIVE_MISC, Link.SPECULATIVE_HOP);
367 incrementLinkCount(1);
368 }
369 TextUtils.recycleMatcher(uri);
370 }
371
372 public void processURIString(String url) throws IOException {
373 if (url.startsWith(JSSTRING)) {
374 incrementLinkCount(ExtractorJS.considerStrings(
375 curi, url, controller,false));
376 } else {
377 curi.createAndAddLinkRelativeToVia(url, Link.EMBED_MISC,
378 Link.EMBED_HOP);
379 incrementLinkCount(1);
380 }
381 }
382
383 private void incrementLinkCount(long count) {
384 numberOfLinksExtracted += count;
385 }
386 }
387
388 public String report() {
389 StringBuffer ret = new StringBuffer();
390 ret.append("Processor: org.archive.crawler.extractor.ExtractorSWF\n");
391 ret.append(" Function: Link extraction on Shockwave Flash "
392 + "documents (.swf)\n");
393
394 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
395 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
396 return ret.toString();
397 }
398
399 }