1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.frontier;
24
25 import java.io.UnsupportedEncodingException;
26 import java.math.BigInteger;
27 import java.util.ArrayList;
28 import java.util.List;
29 import java.util.logging.Level;
30 import java.util.logging.Logger;
31 import java.util.regex.Pattern;
32
33 import org.apache.commons.collections.Closure;
34 import org.archive.crawler.datamodel.CrawlURI;
35 import org.archive.crawler.framework.FrontierMarker;
36 import org.archive.util.ArchiveUtils;
37
38 import com.sleepycat.bind.serial.StoredClassCatalog;
39 import com.sleepycat.je.Cursor;
40 import com.sleepycat.je.Database;
41 import com.sleepycat.je.DatabaseConfig;
42 import com.sleepycat.je.DatabaseEntry;
43 import com.sleepycat.je.DatabaseException;
44 import com.sleepycat.je.DatabaseNotFoundException;
45 import com.sleepycat.je.Environment;
46 import com.sleepycat.je.OperationStatus;
47 import com.sleepycat.util.RuntimeExceptionWrapper;
48
49
50 /***
51 * A BerkeleyDB-database-backed structure for holding ordered
52 * groupings of CrawlURIs. Reading the groupings from specific
53 * per-grouping (per-classKey/per-Host) starting points allows
54 * this to act as a collection of independent queues.
55 *
56 * <p>For how the bdb keys are made, see {@link #calculateInsertKey(CrawlURI)}.
57 *
58 * <p>TODO: refactor, improve naming.
59 *
60 * @author gojomo
61 */
62 public class BdbMultipleWorkQueues {
63 private static final long serialVersionUID = ArchiveUtils
64 .classnameBasedUID(BdbMultipleWorkQueues.class, 1);
65
66 private static final Logger LOGGER =
67 Logger.getLogger(BdbMultipleWorkQueues.class.getName());
68
69 /*** Database holding all pending URIs, grouped in virtual queues */
70 private Database pendingUrisDB = null;
71
72 /*** Supporting bdb serialization of CrawlURIs */
73 private RecyclingSerialBinding crawlUriBinding;
74
75 /***
76 * Create the multi queue in the given environment.
77 *
78 * @param env bdb environment to use
79 * @param classCatalog Class catalog to use.
80 * @param recycle True if we are to reuse db content if any.
81 * @throws DatabaseException
82 */
83 public BdbMultipleWorkQueues(Environment env,
84 StoredClassCatalog classCatalog, final boolean recycle)
85 throws DatabaseException {
86
87 DatabaseConfig dbConfig = new DatabaseConfig();
88 dbConfig.setAllowCreate(true);
89 if (!recycle) {
90 try {
91 env.truncateDatabase(null, "pending", false);
92 } catch (DatabaseNotFoundException e) {
93
94 }
95 }
96
97
98 dbConfig.setDeferredWrite(true);
99
100 this.pendingUrisDB = env.openDatabase(null, "pending", dbConfig);
101 crawlUriBinding =
102 new RecyclingSerialBinding(classCatalog, CrawlURI.class);
103 }
104
105 /***
106 * Delete all CrawlURIs matching the given expression.
107 *
108 * @param match
109 * @param queue
110 * @param headKey
111 * @return count of deleted items
112 * @throws DatabaseException
113 * @throws DatabaseException
114 */
115 public long deleteMatchingFromQueue(String match, String queue,
116 DatabaseEntry headKey) throws DatabaseException {
117 long deletedCount = 0;
118 Pattern pattern = Pattern.compile(match);
119 DatabaseEntry key = headKey;
120 DatabaseEntry value = new DatabaseEntry();
121 Cursor cursor = null;
122 try {
123 cursor = pendingUrisDB.openCursor(null, null);
124 OperationStatus result = cursor.getSearchKeyRange(headKey,
125 value, null);
126
127 while (result == OperationStatus.SUCCESS) {
128 if(value.getData().length>0) {
129 CrawlURI curi = (CrawlURI) crawlUriBinding
130 .entryToObject(value);
131 if (!curi.getClassKey().equals(queue)) {
132
133 break;
134 }
135 if (pattern.matcher(curi.toString()).matches()) {
136 cursor.delete();
137 deletedCount++;
138 }
139 }
140 result = cursor.getNext(key, value, null);
141 }
142 } finally {
143 if (cursor != null) {
144 cursor.close();
145 }
146 }
147
148 return deletedCount;
149 }
150
151 /***
152 * @param m marker
153 * @param maxMatches
154 * @return list of matches starting from marker position
155 * @throws DatabaseException
156 */
157 public List getFrom(FrontierMarker m, int maxMatches) throws DatabaseException {
158 int matches = 0;
159 int tries = 0;
160 ArrayList<CrawlURI> results = new ArrayList<CrawlURI>(maxMatches);
161 BdbFrontierMarker marker = (BdbFrontierMarker) m;
162
163 DatabaseEntry key = marker.getStartKey();
164 DatabaseEntry value = new DatabaseEntry();
165
166 if (key != null) {
167 Cursor cursor = null;
168 OperationStatus result = null;
169 try {
170 cursor = pendingUrisDB.openCursor(null,null);
171
172
173
174 result = cursor.getSearchKey(key, value, null);
175
176 while(matches<maxMatches && result == OperationStatus.SUCCESS) {
177 if(value.getData().length>0) {
178 CrawlURI curi = (CrawlURI) crawlUriBinding.entryToObject(value);
179 if(marker.accepts(curi)) {
180 results.add(curi);
181 matches++;
182 }
183 tries++;
184 }
185 result = cursor.getNext(key,value,null);
186 }
187 } finally {
188 if (cursor !=null) {
189 cursor.close();
190 }
191 }
192
193 if(result != OperationStatus.SUCCESS) {
194
195 marker.setStartKey(null);
196 }
197 }
198 return results;
199 }
200
201 /***
202 * Get a marker for beginning a scan over all contents
203 *
204 * @param regexpr
205 * @return a marker pointing to the first item
206 */
207 public FrontierMarker getInitialMarker(String regexpr) {
208 try {
209 return new BdbFrontierMarker(getFirstKey(), regexpr);
210 } catch (DatabaseException e) {
211 e.printStackTrace();
212 return null;
213 }
214 }
215
216 /***
217 * @return the key to the first item in the database
218 * @throws DatabaseException
219 */
220 protected DatabaseEntry getFirstKey() throws DatabaseException {
221 DatabaseEntry key = new DatabaseEntry();
222 DatabaseEntry value = new DatabaseEntry();
223 Cursor cursor = pendingUrisDB.openCursor(null,null);
224 OperationStatus status = cursor.getNext(key,value,null);
225 cursor.close();
226 if(status == OperationStatus.SUCCESS) {
227 return key;
228 }
229 return null;
230 }
231
232 /***
233 * Get the next nearest item after the given key. Relies on
234 * external discipline -- we'll look at the queues count of how many
235 * items it has -- to avoid asking for something from a
236 * range where there are no associated items --
237 * otherwise could get first item of next 'queue' by mistake.
238 *
239 * <p>TODO: hold within a queue's range
240 *
241 * @param headKey Key prefix that demarks the beginning of the range
242 * in <code>pendingUrisDB</code> we're interested in.
243 * @return CrawlURI.
244 * @throws DatabaseException
245 */
246 public CrawlURI get(DatabaseEntry headKey)
247 throws DatabaseException {
248 DatabaseEntry result = new DatabaseEntry();
249
250
251
252
253
254
255
256
257
258 OperationStatus status = getNextNearestItem(headKey, result);
259 CrawlURI retVal = null;
260 if (status != OperationStatus.SUCCESS) {
261 LOGGER.severe("See '1219854 NPE je-2.0 "
262 + "entryToObject...'. OperationStatus "
263 + " was not SUCCESS: "
264 + status
265 + ", headKey "
266 + BdbWorkQueue.getPrefixClassKey(headKey.getData()));
267 return null;
268 }
269 try {
270 retVal = (CrawlURI)crawlUriBinding.entryToObject(result);
271 } catch (RuntimeExceptionWrapper rw) {
272 LOGGER.log(
273 Level.SEVERE,
274 "expected object missing in queue " +
275 BdbWorkQueue.getPrefixClassKey(headKey.getData()),
276 rw);
277 return null;
278 }
279 retVal.setHolderKey(headKey);
280 return retVal;
281 }
282
283 protected OperationStatus getNextNearestItem(DatabaseEntry headKey,
284 DatabaseEntry result) throws DatabaseException {
285 Cursor cursor = null;
286 OperationStatus status;
287 try {
288 cursor = this.pendingUrisDB.openCursor(null, null);
289
290
291 status = cursor.getSearchKey(headKey, result, null);
292 if(status!=OperationStatus.SUCCESS || result.getData().length > 0) {
293
294 throw new DatabaseException("bdb queue cap missing");
295 }
296
297 status = cursor.getNext(headKey,result,null);
298 } finally {
299 if(cursor!=null) {
300 cursor.close();
301 }
302 }
303 return status;
304 }
305
306 /***
307 * Put the given CrawlURI in at the appropriate place.
308 *
309 * @param curi
310 * @throws DatabaseException
311 */
312 public void put(CrawlURI curi, boolean overwriteIfPresent)
313 throws DatabaseException {
314 DatabaseEntry insertKey = (DatabaseEntry)curi.getHolderKey();
315 if (insertKey == null) {
316 insertKey = calculateInsertKey(curi);
317 curi.setHolderKey(insertKey);
318 }
319 DatabaseEntry value = new DatabaseEntry();
320 crawlUriBinding.objectToEntry(curi, value);
321
322 if (LOGGER.isLoggable(Level.FINE)) {
323 tallyAverageEntrySize(curi, value);
324 }
325 OperationStatus status;
326 if(overwriteIfPresent) {
327 status = pendingUrisDB.put(null, insertKey, value);
328 } else {
329 status = pendingUrisDB.putNoOverwrite(null, insertKey, value);
330 }
331 if(status!=OperationStatus.SUCCESS) {
332 LOGGER.severe("failed; "+status+ " "+curi);
333 }
334 }
335
336 private long entryCount = 0;
337 private long entrySizeSum = 0;
338 private int largestEntry = 0;
339
340 /***
341 * Log average size of database entry.
342 * @param curi CrawlURI this entry is for.
343 * @param value Database entry value.
344 */
345 private synchronized void tallyAverageEntrySize(CrawlURI curi,
346 DatabaseEntry value) {
347 entryCount++;
348 int length = value.getData().length;
349 entrySizeSum += length;
350 int avg = (int) (entrySizeSum/entryCount);
351 if(entryCount % 1000 == 0) {
352 LOGGER.fine("Average entry size at "+entryCount+": "+avg);
353 }
354 if (length>largestEntry) {
355 largestEntry = length;
356 LOGGER.fine("Largest entry: "+length+" "+curi);
357 if(length>(2*avg)) {
358 LOGGER.fine("excessive?");
359 }
360 }
361 }
362
363 /***
364 * Calculate the 'origin' key for a virtual queue of items
365 * with the given classKey. This origin key will be a
366 * prefix of the keys for all items in the queue.
367 *
368 * @param classKey String key to derive origin byte key from
369 * @return a byte array key
370 */
371 static byte[] calculateOriginKey(String classKey) {
372 byte[] classKeyBytes = null;
373 int len = 0;
374 try {
375 classKeyBytes = classKey.getBytes("UTF-8");
376 len = classKeyBytes.length;
377 } catch (UnsupportedEncodingException e) {
378
379 e.printStackTrace();
380 }
381 byte[] keyData = new byte[len+1];
382 System.arraycopy(classKeyBytes,0,keyData,0,len);
383 keyData[len]=0;
384 return keyData;
385 }
386
387 /***
388 * Calculate the insertKey that places a CrawlURI in the
389 * desired spot. First bytes are always classKey (usu. host)
390 * based -- ensuring grouping by host -- terminated by a zero
391 * byte. Then 8 bytes of data ensuring desired ordering
392 * within that 'queue' are used. The first byte of these 8 is
393 * priority -- allowing 'immediate' and 'soon' items to
394 * sort above regular. Next 1 byte is 'cost'. Last 6 bytes
395 * are ordinal serial number, ensuring earlier-discovered
396 * URIs sort before later.
397 *
398 * NOTE: Dangers here are:
399 * (1) priorities or costs over 2^7 (signed byte comparison)
400 * (2) ordinals over 2^48
401 *
402 * Package access & static for testing purposes.
403 *
404 * @param curi
405 * @return a DatabaseEntry key for the CrawlURI
406 */
407 static DatabaseEntry calculateInsertKey(CrawlURI curi) {
408 byte[] classKeyBytes = null;
409 int len = 0;
410 try {
411 classKeyBytes = curi.getClassKey().getBytes("UTF-8");
412 len = classKeyBytes.length;
413 } catch (UnsupportedEncodingException e) {
414
415 e.printStackTrace();
416 }
417 byte[] keyData = new byte[len+9];
418 System.arraycopy(classKeyBytes,0,keyData,0,len);
419 keyData[len]=0;
420 long ordinalPlus = curi.getOrdinal() & 0x0000FFFFFFFFFFFFL;
421 ordinalPlus =
422 ((long)curi.getSchedulingDirective() << 56) | ordinalPlus;
423 ordinalPlus =
424 ((((long)curi.getHolderCost()) & 0xFFL) << 48) | ordinalPlus;
425 ArchiveUtils.longIntoByteArray(ordinalPlus, keyData, len+1);
426 return new DatabaseEntry(keyData);
427 }
428
429 /***
430 * Delete the given CrawlURI from persistent store. Requires
431 * the key under which it was stored be available.
432 *
433 * @param item
434 * @throws DatabaseException
435 */
436 public void delete(CrawlURI item) throws DatabaseException {
437 OperationStatus status;
438 status = pendingUrisDB.delete(null, (DatabaseEntry) item.getHolderKey());
439 if (status != OperationStatus.SUCCESS) {
440 LOGGER.severe("expected item not present: "
441 + item
442 + "("
443 + (new BigInteger(((DatabaseEntry) item.getHolderKey())
444 .getData())).toString(16) + ")");
445 }
446
447 }
448
449 /***
450 * Method used by BdbFrontier during checkpointing.
451 * <p>The backing bdbje database has been marked deferred write so we save
452 * on writes to disk. Means no guarantees disk will have whats in memory
453 * unless a sync is called (Calling sync on the bdbje Environment is not
454 * sufficent).
455 * <p>Package access only because only Frontiers of this package would ever
456 * need access.
457 * @see <a href="http://www.sleepycat.com/jedocs/GettingStartedGuide/DB.html">Deferred Write Databases</a>
458 */
459 void sync() {
460 if (this.pendingUrisDB == null) {
461 return;
462 }
463 try {
464 this.pendingUrisDB.sync();
465 } catch (DatabaseException e) {
466 e.printStackTrace();
467 }
468 }
469
470 /***
471 * clean up
472 *
473 */
474 public void close() {
475 try {
476 this.pendingUrisDB.close();
477 } catch (DatabaseException e) {
478 e.printStackTrace();
479 }
480 }
481
482 /***
483 * Marker for remembering a position within the BdbMultipleWorkQueues.
484 *
485 * @author gojomo
486 */
487 public class BdbFrontierMarker implements FrontierMarker {
488 DatabaseEntry startKey;
489 Pattern pattern;
490 int nextItemNumber;
491
492 /***
493 * Create a marker pointed at the given start location.
494 *
495 * @param startKey
496 * @param regexpr
497 */
498 public BdbFrontierMarker(DatabaseEntry startKey, String regexpr) {
499 this.startKey = startKey;
500 pattern = Pattern.compile(regexpr);
501 nextItemNumber = 1;
502 }
503
504 /***
505 * @param curi
506 * @return whether the marker accepts the given CrawlURI
507 */
508 public boolean accepts(CrawlURI curi) {
509 boolean retVal = pattern.matcher(curi.toString()).matches();
510 if(retVal==true) {
511 nextItemNumber++;
512 }
513 return retVal;
514 }
515
516 /***
517 * @param key position for marker
518 */
519 public void setStartKey(DatabaseEntry key) {
520 startKey = key;
521 }
522
523 /***
524 * @return startKey
525 */
526 public DatabaseEntry getStartKey() {
527 return startKey;
528 }
529
530
531
532
533 public String getMatchExpression() {
534 return pattern.pattern();
535 }
536
537
538
539
540 public long getNextItemNumber() {
541 return nextItemNumber;
542 }
543
544
545
546
547 public boolean hasNext() {
548
549 return startKey != null;
550 }
551 }
552
553 /***
554 * Add a dummy 'cap' entry at the given insertion key. Prevents
555 * 'seeks' to queue heads from holding lock on last item of
556 * 'preceding' queue. See:
557 * http://sourceforge.net/tracker/index.php?func=detail&aid=1262665&group_id=73833&atid=539102
558 *
559 * @param origin key at which to insert the cap
560 */
561 public void addCap(byte[] origin) {
562 try {
563 pendingUrisDB.put(null, new DatabaseEntry(origin),
564 new DatabaseEntry(new byte[0]));
565 } catch (DatabaseException e) {
566 throw new RuntimeException(e);
567 }
568 }
569
570
571 /***
572 * Utility method to perform action for all pending CrawlURI instances.
573 * @param c Closure action to perform
574 * @throws DatabaseException
575 */
576 protected void forAllPendingDo(Closure c) throws DatabaseException {
577 DatabaseEntry key = new DatabaseEntry();
578 DatabaseEntry value = new DatabaseEntry();
579 Cursor cursor = pendingUrisDB.openCursor(null,null);
580 while(cursor.getNext(key,value,null)==OperationStatus.SUCCESS) {
581 if(value.getData().length==0) {
582 continue;
583 }
584 CrawlURI item = (CrawlURI)crawlUriBinding.entryToObject(value);
585 c.execute(item);
586 }
587 }
588 }