1   /* UriProcessingFormatter.java
2    *
3    * $Id: UriProcessingFormatter.java 4964 2007-03-08 06:56:46Z gojomo $
4    * 
5    * Created on Jun 10, 2003
6    *
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.io;
26  
27  import it.unimi.dsi.mg4j.util.MutableString;
28  
29  import java.util.logging.Formatter;
30  import java.util.logging.LogRecord;
31  
32  import org.archive.crawler.datamodel.CoreAttributeConstants;
33  import org.archive.crawler.datamodel.CrawlURI;
34  import org.archive.util.ArchiveUtils;
35  import org.archive.util.Base32;
36  import org.archive.util.MimetypeUtils;
37  
38  /***
39   * Formatter for 'crawl.log'. Expects completed CrawlURI as parameter.
40   *
41   * @author gojomo
42   */
43  public class UriProcessingFormatter
44  extends Formatter implements CoreAttributeConstants {
45      private final static String NA = "-";
46      /***
47       * Guess at line length (URIs are assumed avg. of 128 bytes).
48       * Used to preallocated the buffer we accumulate the log line
49       * in.  Hopefully we get it right most of the time and no need
50       * to enlarge except in the rare case.
51       */
52      private final static int GUESS_AT_LOG_LENGTH =
53          17 + 1 + 3 + 1 + 10 + 128 + + 1 + 10 + 1 + 128 + 1 + 10 + 1 + 3 +
54          14 + 1 + 32 + 4 + 128 + 1;
55      
56      /***
57       * Reuseable assembly buffer.
58       */
59      private final MutableString buffer =
60          new MutableString(GUESS_AT_LOG_LENGTH);
61      
62      public String format(LogRecord lr) {
63          CrawlURI curi = (CrawlURI)lr.getParameters()[0];
64          String length = NA;
65          String mime = null;
66          if (curi.isHttpTransaction()) {
67              if(curi.getContentLength() >= 0) {
68                  length = Long.toString(curi.getContentLength());
69              } else if (curi.getContentSize() > 0) {
70                  length = Long.toString(curi.getContentSize());
71              }
72              mime = curi.getContentType();
73          } else {
74              if (curi.getContentSize() > 0) {
75                  length = Long.toString(curi.getContentSize());
76              } 
77              mime = curi.getContentType();
78          }
79          mime = MimetypeUtils.truncate(mime);
80  
81          long time = System.currentTimeMillis();
82          String arcTimeAndDuration;
83          if(curi.containsKey(A_FETCH_COMPLETED_TIME)) {
84              long completedTime = curi.getLong(A_FETCH_COMPLETED_TIME);
85              long beganTime = curi.getLong(A_FETCH_BEGAN_TIME);
86              arcTimeAndDuration = ArchiveUtils.get17DigitDate(beganTime) + "+"
87                      + Long.toString(completedTime - beganTime);
88          } else {
89              arcTimeAndDuration = NA;
90          }
91  
92          String via = curi.flattenVia();
93          
94          String digest = curi.getContentDigestSchemeString();
95  
96          String sourceTag = curi.containsKey(A_SOURCE_TAG) 
97                  ? curi.getString(A_SOURCE_TAG)
98                  : null;
99                  
100         this.buffer.length(0);
101         return this.buffer.append(ArchiveUtils.getLog17Date(time))
102             .append(" ")
103             .append(ArchiveUtils.padTo(curi.getFetchStatus(), 5))
104             .append(" ")
105             .append(ArchiveUtils.padTo(length, 10))
106             .append(" ")
107             .append(curi.getUURI().toString())
108             .append(" ")
109             .append(checkForNull(curi.getPathFromSeed()))
110             .append(" ")
111             .append(checkForNull(via))
112             .append(" ")
113             .append(mime)
114             .append(" ")
115             .append("#")
116             // Pad threads to be 3 digits.  For Igor.
117             .append(ArchiveUtils.padTo(
118                 Integer.toString(curi.getThreadNumber()), 3, '0'))
119             .append(" ")
120             .append(arcTimeAndDuration)
121             .append(" ")
122             .append(checkForNull(digest))
123             .append(" ")
124             .append(checkForNull(sourceTag))
125             .append(" ")
126             .append(checkForNull(curi.getAnnotations()))
127             .append("\n").toString();
128     }
129     
130     /***
131      * @param str String to check.
132      * @return Return passed string or <code>NA</code> if null.
133      */
134     protected String checkForNull(String str) {
135         return (str == null || str.length() <= 0)? NA: str;
136     }
137 }
138 
139