1   /*
2    * Heritrix
3    *
4    * $Id: ExtractorSWF.java 5465 2007-09-05 09:27:28Z ia_igor $
5    *
6    * Created on March 19, 2004
7    *
8    * Copyright (C) 2003 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  
27  package org.archive.crawler.extractor;
28  
29  import java.io.IOException;
30  import java.io.InputStream;
31  import java.util.Vector;
32  import java.util.logging.Logger;
33  import java.util.regex.Matcher;
34  
35  import org.archive.crawler.datamodel.CoreAttributeConstants;
36  import org.archive.crawler.datamodel.CrawlURI;
37  import org.archive.crawler.framework.CrawlController;
38  import org.archive.util.TextUtils;
39  
40  import com.anotherbigidea.flash.interfaces.SWFActions;
41  import com.anotherbigidea.flash.interfaces.SWFTagTypes;
42  import com.anotherbigidea.flash.readers.SWFReader;
43  import com.anotherbigidea.flash.readers.TagParser;
44  import com.anotherbigidea.flash.structs.AlphaTransform;
45  import com.anotherbigidea.flash.structs.Matrix;
46  import com.anotherbigidea.flash.writers.SWFActionsImpl;
47  import com.anotherbigidea.flash.writers.SWFTagTypesImpl;
48  import com.anotherbigidea.io.InStream;
49  
50  /***
51   * Process SWF (flash/shockwave) files for strings that are likely to be
52   * crawlable URIs.
53   * 
54   * @author Igor Ranitovic
55   */
56  public class ExtractorSWF extends Extractor implements CoreAttributeConstants {
57  
	// Serialization version identifier for this class.
	private static final long serialVersionUID = 3627359592408010589L;

	// Logger named after this class; used for the per-URI summary line and
	// for reporting skipped oversized tags.
	private static Logger logger = Logger.getLogger(ExtractorSWF.class
			.getName());

	// Number of CrawlURIs for which a SWF reader was obtained and a parse
	// was attempted. Printed by report().
	protected long numberOfCURIsHandled = 0;

	// Running total of links added by the SWF action callbacks. It is only
	// ever incremented, so it accumulates across every document handled by
	// this instance. Printed by report().
	protected long numberOfLinksExtracted = 0;

	// Largest tag body, in bytes, that will be read into memory; tags that
	// declare a longer length are skipped instead of being read.
	// TODO: consider if this should be even smaller, because anything
	// containing URLs wouldn't be this big
	private static final int MAX_READ_SIZE = 1024 * 1024; // 1MB
70  
71  	/***
72  	 * @param name
73  	 */
74  	public ExtractorSWF(String name) {
75  		super(name, "Flash extractor. Extracts URIs from SWF "
76  				+ "(flash/shockwave) files.");
77  	}
78  
79  	protected void extract(CrawlURI curi) {
80  		if (!isHttpTransactionContentToProcess(curi)) {
81  			return;
82  		}
83  
84  		String contentType = curi.getContentType();
85  		if (contentType == null) {
86  			return;
87  		}
88  		if ((contentType.toLowerCase().indexOf("x-shockwave-flash") < 0)
89  				&& (!curi.toString().toLowerCase().endsWith(".swf"))) {
90  			return;
91  		}
92  
93  		// Get link extracting SWF reader
94  		SWFReader reader = getSWFReader(curi);
95  		if (reader == null) {
96  			return;
97  		}
98  
99  		numberOfCURIsHandled++;
100 		try {
101 			// Parse file for links
102 			reader.readFile();
103 		} catch (IOException e) {
104 			curi.addLocalizedError(getName(), e, "failed reading");
105 		} catch (NullPointerException e) {
106 			curi.addLocalizedError(getName(), e, "bad .swf file");
107 		} catch (NegativeArraySizeException e) {
108 			curi.addLocalizedError(getName(), e, "bad .swf file");
109 		}
110 
111 		// Set flag to indicate that link extraction is completed.
112 		curi.linkExtractorFinished();
113 		logger.fine(curi + " has " + numberOfLinksExtracted + " links.");
114 
115 	}
116 
117 	/***
118 	 * Get a link extracting SWFParser.
119 	 * 
120 	 * A custom SWFReader which parses links from .swf file.
121 	 * 
122 	 * @param curi A CrawlURI to be processed.
123 	 * @return An SWFReader.
124 	 */
125 	private SWFReader getSWFReader(CrawlURI curi) {
126 
127 		InputStream documentStream = null;
128 		// Get the SWF file's content stream.
129 		try {
130 			documentStream = curi.getHttpRecorder().getRecordedInput()
131 					.getContentReplayInputStream();
132 			if (documentStream == null) {
133 				return null;
134 			}
135 		} catch (IOException e) {
136 			curi.addLocalizedError(getName(), e, "Fail reading.");
137 		} finally {
138 			try {
139 				documentStream.close();
140 			} catch (IOException e) {
141 				curi.addLocalizedError(getName(), e, "Fail on close.");
142 			}
143 		}
144 
145 		// Create SWF actions that will add discoved URIs to CrawlURI
146 		// alist(s).
147 		ExtractorSWFActions actions = new ExtractorSWFActions(curi,
148 				getController());
149 		// Overwrite parsing of specific tags that might have URIs.
150 		ExtractorSWFTags tags = new ExtractorSWFTags(actions);
151 		// Get a SWFReader instance.
152 		SWFReader reader = new SWFReader(getTagParser(tags), documentStream) {
153 			/***
154 			 * Override because a corrupt SWF file can cause us to try read
155 			 * lengths that are hundreds of megabytes in size causing us to
156 			 * OOME.
157 			 * 
158 			 * Below is copied from SWFReader parent class.
159 			 */
160 			public int readOneTag() throws IOException {
161 				int header = mIn.readUI16();
162 				int type = header >> 6; // only want the top 10 bits
163 				int length = header & 0x3F; // only want the bottom 6 bits
164 				boolean longTag = (length == 0x3F);
165 				if (longTag) {
166 					length = (int) mIn.readUI32();
167 				}
168 				// Below test added for Heritrix use.
169 				if (length > MAX_READ_SIZE) {
170 					// skip to next, rather than throw IOException ending
171 					// processing
172 					mIn.skipBytes(length);
173 					logger.info("oversized SWF tag (type=" + type + ";length="
174 							+ length + ") skipped");
175 				} else {
176 					byte[] contents = mIn.read(length);
177 					mConsumer.tag(type, longTag, contents);
178 				}
179 				return type;
180 			}
181 		};
182 		return reader;
183 	}
184 
185 	/***
186 	 * Get a TagParser
187 	 * 
188 	 * A custom ExtractorTagParser which ignores all the big binary image/
189 	 * sound/font types which don't carry URLs is used, to avoid the
190 	 * occasionally fatal (OutOfMemoryError) memory bloat caused by the
191 	 * all-in-memory SWF library handling.
192 	 * 
193 	 * @param customTags
194 	 *            A custom tag parser.
195 	 * @return An SWFReader.
196 	 */
197 	private TagParser getTagParser(SWFTagTypes customTags) {
198 		return new ExtractorTagParser(customTags);
199 	}
200 
201 	/***
202 	 * TagParser customized to ignore SWFTags that will never contain
203 	 * extractable URIs.
204 	 */
205 	protected class ExtractorTagParser extends TagParser {
206 
207 		protected ExtractorTagParser(SWFTagTypes tagtypes) {
208 			super(tagtypes);
209 		}
210 
211 		protected void parseDefineBits(InStream in) throws IOException {
212 			// DO NOTHING - no URLs to be found in bits
213 		}
214 
215 		protected void parseDefineBitsJPEG3(InStream in) throws IOException {
216 			// DO NOTHING - no URLs to be found in bits
217 		}
218 
219 		protected void parseDefineBitsLossless(InStream in, int length,
220 				boolean hasAlpha) throws IOException {
221 			// DO NOTHING - no URLs to be found in bits
222 		}
223 
224 		protected void parseDefineButtonSound(InStream in) throws IOException {
225 			// DO NOTHING - no URLs to be found in sound
226 		}
227 
228 		protected void parseDefineFont(InStream in) throws IOException {
229 			// DO NOTHING - no URLs to be found in font
230 		}
231 
232 		protected void parseDefineJPEG2(InStream in, int length)
233 				throws IOException {
234 			// DO NOTHING - no URLs to be found in jpeg
235 		}
236 
237 		protected void parseDefineJPEGTables(InStream in) throws IOException {
238 			// DO NOTHING - no URLs to be found in jpeg
239 		}
240 
241 		protected void parseDefineShape(int type, InStream in)
242 				throws IOException {
243 			// DO NOTHING - no URLs to be found in shape
244 		}
245 
246 		protected void parseDefineSound(InStream in) throws IOException {
247 			// DO NOTHING - no URLs to be found in sound
248 		}
249 
250 		protected void parseFontInfo(InStream in, int length, boolean isFI2)
251 				throws IOException {
252 			// DO NOTHING - no URLs to be found in font info
253 		}
254 
255 		protected void parseDefineFont2(InStream in) throws IOException {
256 			// DO NOTHING - no URLs to be found in bits
257 		}
258 	}
259 
260 	/***
261 	 * SWFTagTypes customized to use <code>ExtractorSWFActions</code>, which
262 	 * parse URI-like strings.
263 	 */
264 	protected class ExtractorSWFTags extends SWFTagTypesImpl {
265 
266 		private SWFActions actions;
267 
268 		public ExtractorSWFTags(SWFActions acts) {
269 			super(null);
270 			actions = acts;
271 		}
272 
273 		public SWFActions tagDefineButton(int id, Vector buttonRecords)
274 				throws IOException {
275 
276 			return actions;
277 		}
278 
279 		public SWFActions tagDefineButton2(int id, boolean trackAsMenu,
280 				Vector buttonRecord2s) throws IOException {
281 
282 			return actions;
283 		}
284 
285 		public SWFActions tagDoAction() throws IOException {
286 			return actions;
287 		}
288 
289 		public SWFActions tagDoInActions(int spriteId) throws IOException {
290 			return actions;
291 		}
292 
293 		public SWFTagTypes tagDefineSprite(int id) throws IOException {
294 			return this;
295 		}
296 
297 		public SWFActions tagPlaceObject2(boolean isMove, int clipDepth,
298 				int depth, int charId, Matrix matrix, AlphaTransform cxform,
299 				int ratio, String name, int clipActionFlags) throws IOException {
300 
301 			return actions;
302 		}
303 
304 	}
305 
306 	/***
307 	 * SWFActions that parse URI-like strings. Links discovered using
308 	 * <code>ExtractorJS</code> are marked as speculative links (hop X). All
309 	 * other links are marked as embedded links (hop E).
310 	 * 
311 	 */
312 	protected class ExtractorSWFActions extends SWFActionsImpl {
313 
314 		private CrawlURI curi;
315 
316 		private CrawlController controller;
317 
318 		static final String JSSTRING = "javascript:";
319 
320 		/***
321 		 * @param curi
322 		 *            SWF URL to handle
323 		 * @param controller
324 		 *            Crawl controller need for error reporting
325 		 */
326 		public ExtractorSWFActions(CrawlURI curi, CrawlController controller) {
327 			assert (curi != null) : "CrawlURI should not be null";
328 			this.curi = curi;
329 			this.controller = controller;
330 		}
331 
332 		/***
333 		 * Overwrite handling of discovered URIs.
334 		 * 
335 		 * @param url
336 		 *            Discovered URL.
337 		 * @param target
338 		 *            Discovered target (currently not being used.)
339 		 * @throws IOException
340 		 */
341 		public void getURL(String url, String target) throws IOException {
342 			// I have done tests on a few tens of swf files and have not seen a
343 			// need
344 			// to use 'target.' Most of the time 'target' is not set, or it is
345 			// set
346 			// to '_self' or '_blank'.
347 			processURIString(url);
348 		}
349 
350 		public void lookupTable(String[] strings) throws IOException {
351 			for (String str : strings) {
352 				considerStringAsUri(str);
353 			}
354 		}
355 
356 		public void push(String value) throws IOException {
357 			considerStringAsUri(value);
358 		}
359 
360 		public void considerStringAsUri(String str) throws IOException {
361 			Matcher uri = TextUtils.getMatcher(ExtractorJS.STRING_URI_DETECTOR,
362 					str);
363 
364 			if (uri.matches()) {
365 				curi.createAndAddLinkRelativeToVia(uri.group(),
366 						Link.SPECULATIVE_MISC, Link.SPECULATIVE_HOP);
367 				incrementLinkCount(1);
368 			}
369 			TextUtils.recycleMatcher(uri);
370 		}
371 
372 		public void processURIString(String url) throws IOException {
373 			if (url.startsWith(JSSTRING)) {
374 				incrementLinkCount(ExtractorJS.considerStrings(
375 						curi, url, controller,false));
376 			} else {
377 				curi.createAndAddLinkRelativeToVia(url, Link.EMBED_MISC,
378 						Link.EMBED_HOP);
379 				incrementLinkCount(1);
380 			}
381 		}
382 
383 		private void incrementLinkCount(long count) {
384 			numberOfLinksExtracted += count;
385 		}
386 	}
387 
388 	public String report() {
389 		StringBuffer ret = new StringBuffer();
390 		ret.append("Processor: org.archive.crawler.extractor.ExtractorSWF\n");
391 		ret.append("  Function:          Link extraction on Shockwave Flash "
392 				+ "documents (.swf)\n");
393 
394 		ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
395 		ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
396 		return ret.toString();
397 	}
398 
399 }