org.archive.crawler.writer
Class WARCWriterProcessor
java.lang.Object
javax.management.Attribute
org.archive.crawler.settings.Type
org.archive.crawler.settings.ComplexType
org.archive.crawler.settings.ModuleType
org.archive.crawler.framework.Processor
org.archive.crawler.framework.WriterPoolProcessor
org.archive.crawler.writer.WARCWriterProcessor
- All Implemented Interfaces:
- java.io.Serializable, javax.management.DynamicMBean, CoreAttributeConstants, FetchStatusCodes, CrawlStatusListener, ArchiveFileConstants, WARCConstants, WriterPoolSettings
public class WARCWriterProcessor
- extends WriterPoolProcessor
- implements CoreAttributeConstants, CrawlStatusListener, WriterPoolSettings, FetchStatusCodes, WARCConstants
Experimental WARCWriterProcessor.
Targets the 0.17 version of the WARC specification.
See http://crawler.archive.org/warc/0.17/WARC0.17ISO.doc .
TODO: Remove ANVLRecord. Rename NameValue or use RFC822
(commons-httpclient?) or find something else.
- Author:
- stack
- See Also:
- Serialized Form
Fields inherited from class org.archive.crawler.framework.WriterPoolProcessor |
ANNOTATION_UNWRITTEN, ATTR_COMPRESS, ATTR_MAX_BYTES_WRITTEN, ATTR_MAX_SIZE_BYTES, ATTR_PATH, ATTR_POOL_MAX_ACTIVE, ATTR_POOL_MAX_WAIT, ATTR_PREFIX, ATTR_SKIP_IDENTICAL_DIGESTS, ATTR_SUFFIX, DEFAULT_COMPRESS |
Fields inherited from interface org.archive.crawler.datamodel.CoreAttributeConstants |
A_ANNOTATIONS, A_CONTENT_DIGEST, A_CONTENT_TYPE, A_CREDENTIAL_AVATARS_KEY, A_DELAY_FACTOR, A_DISTANCE_FROM_SEED, A_DNS_FETCH_TIME, A_DNS_SERVER_IP_LABEL, A_ETAG_HEADER, A_FETCH_BEGAN_TIME, A_FETCH_COMPLETED_TIME, A_FETCH_HISTORY, A_FORCE_RETIRE, A_HERITABLE_KEYS, A_HTML_BASE, A_HTTP_BIND_ADDRESS, A_HTTP_PROXY_HOST, A_HTTP_PROXY_PORT, A_HTTP_TRANSACTION, A_LAST_MODIFIED_HEADER, A_LOCALIZED_ERRORS, A_META_ROBOTS, A_MINIMUM_DELAY, A_MIRROR_PATH, A_PREREQUISITE_URI, A_REFERENCE_LENGTH, A_RETRY_DELAY, A_RRECORD_SET_LABEL, A_RUNTIME_EXCEPTION, A_SOURCE_TAG, A_STATUS, HEADER_TRUNC, LENGTH_TRUNC, TIMER_TRUNC, TRUNC_SUFFIX |
Fields inherited from interface org.archive.crawler.datamodel.FetchStatusCodes |
S_BLOCKED_BY_CUSTOM_PROCESSOR, S_BLOCKED_BY_QUOTA, S_BLOCKED_BY_RUNTIME_LIMIT, S_BLOCKED_BY_USER, S_CONNECT_FAILED, S_CONNECT_LOST, S_DEEMED_CHAFF, S_DEFERRED, S_DELETED_BY_USER, S_DNS_SUCCESS, S_DOMAIN_PREREQUISITE_FAILURE, S_DOMAIN_UNRESOLVABLE, S_GETBYNAME_SUCCESS, S_OTHER_PREREQUISITE_FAILURE, S_OUT_OF_SCOPE, S_PREREQUISITE_UNSCHEDULABLE_FAILURE, S_PROCESSING_THREAD_KILLED, S_ROBOTS_PRECLUDED, S_ROBOTS_PREREQUISITE_FAILURE, S_RUNTIME_EXCEPTION, S_SERIOUS_ERROR, S_TIMEOUT, S_TOO_MANY_EMBED_HOPS, S_TOO_MANY_LINK_HOPS, S_TOO_MANY_RETRIES, S_UNATTEMPTED, S_UNFETCHABLE_URI, S_UNQUEUEABLE |
Fields inherited from interface org.archive.io.warc.WARCConstants |
COLON_SPACE, COMPRESSED_WARC_FILE_EXTENSION, CONTENT_DESCRIPTION, CONTENT_LENGTH, CONTENT_TYPE, CONTINUATION, CONTINUATION_INDEX, CONVERSION, CONVERSION_INDEX, DEFAULT_ENCODING, DEFAULT_MAX_WARC_FILE_SIZE, DOT_COMPRESSED_FILE_EXTENSION, DOT_COMPRESSED_WARC_FILE_EXTENSION, DOT_WARC_FILE_EXTENSION, HEADER_FIELD_KEYS, HEADER_FIELD_SEPARATOR, HEADER_KEY_BLOCK_DIGEST, HEADER_KEY_CONCURRENT_TO, HEADER_KEY_DATE, HEADER_KEY_ETAG, HEADER_KEY_FILENAME, HEADER_KEY_ID, HEADER_KEY_IP, HEADER_KEY_LAST_MODIFIED, HEADER_KEY_PAYLOAD_DIGEST, HEADER_KEY_PROFILE, HEADER_KEY_TRUNCATED, HEADER_KEY_TYPE, HEADER_KEY_URI, HEADER_LINE_ENCODING, HTTP_REQUEST_MIMETYPE, HTTP_RESPONSE_MIMETYPE, MAX_LINE_LENGTH, MAX_WARC_HEADER_LINE_LENGTH, METADATA, METADATA_INDEX, NAMED_FIELD_CHECKSUM_LABEL, NAMED_FIELD_DESCRIPTION, NAMED_FIELD_FILEDESC, NAMED_FIELD_IP_LABEL, NAMED_FIELD_RELATED_LABEL, NAMED_FIELD_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_HEAD, NAMED_FIELD_TRUNCATED_VALUE_LENGTH, NAMED_FIELD_TRUNCATED_VALUE_TIME, NAMED_FIELD_TRUNCATED_VALUE_UNSPECIFIED, NAMED_FIELD_WARCFILENAME, PLACEHOLDER_RECORD_LENGTH_STRING, PROFILE_REVISIT_IDENTICAL_DIGEST, PROFILE_REVISIT_NOT_MODIFIED, REQUEST, REQUEST_INDEX, RESOURCE, RESOURCE_INDEX, RESPONSE, RESPONSE_INDEX, REVISIT, REVISIT_INDEX, TRUNCATED_VALUE_UNSPECIFIED, TYPE, TYPES, TYPES_LIST, WARC_010_ID, WARC_010_MAGIC, WARC_FILE_EXTENSION, WARC_HEADER_ENCODING, WARC_ID, WARC_MAGIC, WARC_VERSION, WARCINFO, WARCINFO_INDEX, WSP |
Fields inherited from interface org.archive.io.ArchiveFileConstants |
ABSOLUTE_OFFSET_KEY, CDX, CDX_FILE, CDX_LINE_BUFFER_SIZE, COMPRESSED_FILE_EXTENSION, CRLF, DATE_FIELD_KEY, DEFAULT_DIGEST_METHOD, DUMP, GZIP_DUMP, HEADER, INVALID_SUFFIX, LENGTH_FIELD_KEY, MIMETYPE_FIELD_KEY, NOHEAD, OCCUPIED_SUFFIX, READER_IDENTIFIER_FIELD_KEY, RECORD_IDENTIFIER_FIELD_KEY, SINGLE_SPACE, TYPE_FIELD_KEY, URL_FIELD_KEY, VERSION_FIELD_KEY |
Method Summary |
protected void |
addIfNotBlank(ANVLRecord record,
java.lang.String label,
java.lang.String value)
|
protected java.lang.String[] |
getDefaultPath()
|
protected java.lang.String |
getFirstrecordBody(java.io.File orderFile)
Return relevant values as header-like fields (here ANVLRecord, but
spec-defined "application/warc-fields" type when written). |
protected java.lang.String |
getFirstrecordStylesheet()
|
protected java.net.URI |
getRecordID()
|
protected void |
innerProcess(CrawlURI curi)
Writes a CrawlURI and its associated data to store file. |
protected java.net.URI |
qualifyRecordID(java.net.URI base,
java.lang.String key,
java.lang.String value)
|
protected void |
saveHeader(java.lang.String origName,
org.apache.commons.httpclient.HttpMethodBase method,
ANVLRecord headers,
java.lang.String newName)
Save a header from the given HTTP operation into the
provided headers under a new name. |
protected void |
setupPool(java.util.concurrent.atomic.AtomicInteger serialNo)
Set up pool of files. |
protected void |
write(java.lang.String lowerCaseScheme,
CrawlURI curi)
|
protected java.net.URI |
writeMetadata(WARCWriter w,
java.lang.String timestamp,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
protected java.net.URI |
writeRequest(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
protected java.net.URI |
writeResource(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
protected java.net.URI |
writeResponse(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
protected java.net.URI |
writeRevisitDigest(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
protected java.net.URI |
writeRevisitNotModified(WARCWriter w,
java.lang.String timestamp,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
|
Methods inherited from class org.archive.crawler.framework.WriterPoolProcessor |
cacheMetadata, checkBytesWritten, checkpointRecover, crawlCheckpoint, crawlEnded, crawlEnding, crawlPaused, crawlPausing, crawlResuming, crawlStarted, getAttributeUnchecked, getCheckpointStateFile, getHostAddress, getMaxSize, getMaxToWrite, getMetadata, getOutputDirs, getPool, getPoolMaximumActive, getPoolMaximumWait, getPrefix, getSerialNo, getSuffix, getTotalBytesWritten, initialTasks, isCompressed, loadCheckpointSerialNumber, saveCheckpointSerialNumber, setPool, setTotalBytesWritten, shouldWrite |
Methods inherited from class org.archive.crawler.framework.Processor |
checkForInterrupt, finalTasks, getController, getDecideRule, getDefaultNextProcessor, innerRejectProcess, isContentToProcess, isExpectedMimeType, isHttpTransactionContentToProcess, kickUpdate, process, report, rulesAccept, rulesAccept, setDefaultNextProcessor, spawn |
Methods inherited from class org.archive.crawler.settings.ComplexType |
addElementToDefinition, checkValue, earlyInitialize, getAbsoluteName, getAttribute, getAttribute, getAttribute, getAttributeInfo, getAttributeInfo, getAttributeInfoIterator, getAttributes, getDataContainerRecursive, getDataContainerRecursive, getDefaultValue, getDescription, getElementFromDefinition, getLegalValues, getLocalAttribute, getMBeanInfo, getMBeanInfo, getParent, getPreservedFields, getSettingsHandler, getUncheckedAttribute, getValue, globalSettings, invoke, isInitialized, isOverridden, iterator, removeElementFromDefinition, setAsOrder, setAttribute, setAttribute, setAttributes, setDescription, setPreservedFields, toString, unsetAttribute |
Methods inherited from class org.archive.crawler.settings.Type |
addConstraint, equals, getConstraints, getLegalValueType, isExpertSetting, isOverrideable, isTransient, setExpertSetting, setLegalValueType, setOverrideable, setTransient |
Methods inherited from class javax.management.Attribute |
getName |
Methods inherited from class java.lang.Object |
clone, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait |
ATTR_WRITE_REQUESTS
public static final java.lang.String ATTR_WRITE_REQUESTS
- Key for whether to write 'request' type records where possible
- See Also:
- Constant Field Values
ATTR_WRITE_METADATA
public static final java.lang.String ATTR_WRITE_METADATA
- Key for whether to write 'metadata' type records where possible
- See Also:
- Constant Field Values
ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS
public static final java.lang.String ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS
- Key for whether to write 'revisit' type records when
consecutive fetches yield an identical digest
- See Also:
- Constant Field Values
ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED
public static final java.lang.String ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED
- Key for whether to write 'revisit' type records for server
"304 not modified" responses
- See Also:
- Constant Field Values
WARCWriterProcessor
public WARCWriterProcessor(java.lang.String name)
- Parameters:
name
- Name of this writer.
getDefaultPath
protected java.lang.String[] getDefaultPath()
- Overrides:
getDefaultPath
in class WriterPoolProcessor
setupPool
protected void setupPool(java.util.concurrent.atomic.AtomicInteger serialNo)
- Description copied from class:
WriterPoolProcessor
- Set up pool of files.
- Specified by:
setupPool
in class WriterPoolProcessor
innerProcess
protected void innerProcess(CrawlURI curi)
- Writes a CrawlURI and its associated data to store file.
Currently this method understands the following uri types: dns, http, and
https.
- Specified by:
innerProcess
in class WriterPoolProcessor
- Parameters:
curi
- CrawlURI to process.
write
protected void write(java.lang.String lowerCaseScheme,
CrawlURI curi)
throws java.io.IOException
- Throws:
java.io.IOException
writeRequest
protected java.net.URI writeRequest(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
throws java.io.IOException
- Throws:
java.io.IOException
writeResponse
protected java.net.URI writeResponse(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
throws java.io.IOException
- Throws:
java.io.IOException
writeResource
protected java.net.URI writeResource(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
throws java.io.IOException
- Throws:
java.io.IOException
writeRevisitDigest
protected java.net.URI writeRevisitDigest(WARCWriter w,
java.lang.String timestamp,
java.lang.String mimetype,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
throws java.io.IOException
- Throws:
java.io.IOException
writeRevisitNotModified
protected java.net.URI writeRevisitNotModified(WARCWriter w,
java.lang.String timestamp,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
throws java.io.IOException
- Throws:
java.io.IOException
saveHeader
protected void saveHeader(java.lang.String origName,
org.apache.commons.httpclient.HttpMethodBase method,
ANVLRecord headers,
java.lang.String newName)
- Save a header from the given HTTP operation into the
provided headers under a new name.
- Parameters:
origName
- header name to get if present
method
- http operation containing headers
writeMetadata
protected java.net.URI writeMetadata(WARCWriter w,
java.lang.String timestamp,
java.net.URI baseid,
CrawlURI curi,
ANVLRecord namedFields)
throws java.io.IOException
- Throws:
java.io.IOException
getRecordID
protected java.net.URI getRecordID()
throws java.io.IOException
- Throws:
java.io.IOException
qualifyRecordID
protected java.net.URI qualifyRecordID(java.net.URI base,
java.lang.String key,
java.lang.String value)
throws java.io.IOException
- Throws:
java.io.IOException
getFirstrecordStylesheet
protected java.lang.String getFirstrecordStylesheet()
- Overrides:
getFirstrecordStylesheet
in class WriterPoolProcessor
getFirstrecordBody
protected java.lang.String getFirstrecordBody(java.io.File orderFile)
- Return relevant values as header-like fields (here ANVLRecord, but
spec-defined "application/warc-fields" type when written). Field
names from DCMI Terms and the WARC/0.17 specification.
- Overrides:
getFirstrecordBody
in class WriterPoolProcessor
- Parameters:
orderFile
- Order file.
- Returns:
- String that holds the arc metaheader body.
- See Also:
WriterPoolProcessor.getFirstrecordBody(java.io.File)
addIfNotBlank
protected void addIfNotBlank(ANVLRecord record,
java.lang.String label,
java.lang.String value)
Copyright © 2003-2008 Internet Archive. All Rights Reserved.