/* Robotstxt.java
 *
 * $Id: Robotstxt.java 4947 2007-03-01 04:47:24Z gojomo $
 *
 * Created Sep 1, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.datamodel;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Utility class for parsing 'robots.txt' format directives into a list
 * of named user-agents and a map from user-agents to disallowed paths.
 */
public class Robotstxt {
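    /**
     * Parse robots.txt-format directives from the given reader,
     * accumulating results into the supplied collections.
     *
     * @param reader source of robots.txt lines; closed when exhausted
     * @param userAgents receives each named user-agent in the order
     *        encountered, with the catch-all '*' agent (stored as the
     *        empty string) appended last
     * @param disallows receives a mapping from each user-agent to its
     *        list of disallowed path-prefixes
     * @return true if syntax errors were encountered (such as a
     *         directive appearing before any User-Agent line)
     * @throws IOException if reading fails
     */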
    public static boolean parse(BufferedReader reader,
            final LinkedList<String> userAgents,
            final Map<String,List<String>> disallows)
    throws IOException {
        boolean hasErrors = false;
        String read;
        // current is the disallowed paths for the preceding User-Agent(s)
        ArrayList<String> current = null;
        // whether a non-'User-Agent' directive has been encountered
        boolean hasDirectivesYet = false;
        // the catch-all '*' agent, if seen, recorded as the empty string
        String catchall = null;
        while (reader != null) {
            do {
                read = reader.readLine();
                // Skip comments & blanks
            } while ((read != null) && ((read = read.trim()).startsWith("#") ||
                read.length() == 0));
            if (read == null) {
                reader.close();
                reader = null;
            } else {
                int commentIndex = read.indexOf("#");
                if (commentIndex > -1) {
                    // Strip trailing comment
                    read = read.substring(0, commentIndex);
                }
                read = read.trim();
                if (read.matches("(?i)User-agent:.*")) {
                    String ua = read.substring(11).trim().toLowerCase();
                    if (current == null || hasDirectivesYet) {
                        // only create new rules-list if necessary
                        // otherwise share with previous user-agent
                        current = new ArrayList<String>();
                        hasDirectivesYet = false;
                    }
                    if (ua.equals("*")) {
                        // represent the catch-all agent as the empty string;
                        // it is appended to userAgents last, below, so that
                        // explicitly named agents take precedence
                        ua = "";
                        catchall = ua;
                    } else {
                        userAgents.addLast(ua);
                    }
                    disallows.put(ua, current);
                    continue;
                }
                if (read.matches("(?i)Disallow:.*")) {
                    if (current == null) {
                        // buggy robots.txt
                        hasErrors = true;
                        continue;
                    }
                    String path = read.substring(9).trim();
                    current.add(path);
                    hasDirectivesYet = true;
                    continue;
                }
                if (read.matches("(?i)Crawl-delay:.*")) {
                    if (current == null) {
                        // buggy robots.txt
                        hasErrors = true;
                        continue;
                    }
                    // consider a crawl-delay, even though we don't
                    // yet understand it, as sufficient to end a
                    // grouping of User-Agent lines
                    hasDirectivesYet = true;
                    // TODO: understand/save/respect 'Crawl-Delay'
                    continue;
                }
                if (read.matches("(?i)Allow:.*")) {
                    if (current == null) {
                        // buggy robots.txt
                        hasErrors = true;
                        continue;
                    }
                    // consider an Allow, even though we don't
                    // yet understand it, as sufficient to end a
                    // grouping of User-Agent lines
                    hasDirectivesYet = true;
                    // TODO: understand/save/respect 'Allow'
                    continue;
                }
                // unknown line; do nothing for now
            }
        }

        if (catchall != null) {
            // list the catch-all agent last, so that any explicitly
            // named agent is preferred when matching
            userAgents.addLast(catchall);
        }
        return hasErrors;
    }

    /**
     * Minimal command-line demonstration: parses a small robots.txt
     * sample and prints the user-agents and disallow rules that were
     * extracted. The sample content below is illustrative only, not
     * taken from any real site.
     *
     * @param args Command-line arguments (unused).
     */
    public static void main(String[] args) throws IOException {
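        String sample =
            "User-agent: *\n" +
            "Disallow: /cgi-bin/\n" +
            "\n" +
            "User-agent: example-bot\n" +
            "Disallow: /private/\n";
        LinkedList<String> userAgents = new LinkedList<String>();
        Map<String,List<String>> disallows =
            new HashMap<String,List<String>>();
        boolean hasErrors = parse(
            new BufferedReader(new StringReader(sample)),
            userAgents, disallows);
        System.out.println("syntax errors: " + hasErrors);
        for (String ua : userAgents) {
            // the catch-all agent appears as the empty string
            System.out.println("agent '" + ua + "' disallows: "
                + disallows.get(ua));
        }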
    }
}