1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.datamodel;
26
27 import java.io.BufferedReader;
28 import java.io.IOException;
29 import java.util.ArrayList;
30 import java.util.LinkedList;
31 import java.util.List;
32 import java.util.Map;
33
/**
 * Utility class for parsing 'robots.txt' format directives into a list
 * of named user-agents and a map from user-agents to disallowed paths.
 */
public class Robotstxt {
    /** Literal prefix of a user-agent line; its length locates the value. */
    private static final String USER_AGENT_PREFIX = "User-agent:";

    /** Literal prefix of a disallow line; its length locates the value. */
    private static final String DISALLOW_PREFIX = "Disallow:";

    // Compiled once: String.matches() would recompile each pattern for
    // every line of input.
    private static final java.util.regex.Pattern USER_AGENT_LINE =
        java.util.regex.Pattern.compile("(?i)^User-agent:.*");
    private static final java.util.regex.Pattern DISALLOW_LINE =
        java.util.regex.Pattern.compile("(?i)Disallow:.*");
    /** Directives that are recognized but whose values are not recorded. */
    private static final java.util.regex.Pattern OTHER_DIRECTIVE_LINE =
        java.util.regex.Pattern.compile("(?i)(?:Crawl-delay|Allow):.*");

    /**
     * Parses robots.txt content, collecting user-agents and their
     * disallowed paths into the supplied collections.
     *
     * <p>Each {@code User-agent} name is lower-cased and appended to
     * {@code userAgents} in order of appearance; the wildcard agent
     * {@code *} is recorded as the empty string and appended last.
     * Consecutive {@code User-agent} lines with no directive between them
     * share one (the same) path list in {@code disallows}. Every
     * {@code Disallow} value, including an empty one, is added to the
     * current list. {@code Crawl-delay} and {@code Allow} lines are
     * recognized but their values are not recorded. Blank lines, comments
     * (everything from a {@code #} onward) and unrecognized lines are
     * skipped.
     *
     * <p>The reader is always closed before this method returns or throws.
     *
     * @param reader source of the robots.txt text; if {@code null} nothing
     *        is parsed and {@code false} is returned
     * @param userAgents receives the user-agent names encountered
     * @param disallows receives, per user-agent name, its list of
     *        disallowed path prefixes
     * @return {@code true} if any recognized directive appeared before the
     *         first {@code User-agent} line, {@code false} otherwise
     * @throws IOException if reading from or closing the reader fails
     */
    public static boolean parse(BufferedReader reader,
            final LinkedList<String> userAgents,
            final Map<String,List<String>> disallows)
    throws IOException {
        if (reader == null) {
            return false;
        }

        boolean hasErrors = false;
        // Path list shared by the user-agent group currently being read.
        List<String> current = null;
        // True once the current group has seen at least one directive, so
        // that the next User-agent line starts a fresh group.
        boolean hasDirectivesYet = false;
        String catchall = null;

        boolean completed = false;
        try {
            String raw;
            while ((raw = reader.readLine()) != null) {
                String read = stripComment(raw);
                if (read.length() == 0) {
                    continue;
                }
                if (USER_AGENT_LINE.matcher(read).matches()) {
                    // Fixed locale: the default locale must not influence
                    // the keys (e.g. Turkish maps 'I' to dotless 'i').
                    String ua = read.substring(USER_AGENT_PREFIX.length())
                        .trim().toLowerCase(java.util.Locale.ENGLISH);
                    if (current == null || hasDirectivesYet) {
                        current = new ArrayList<String>();
                        hasDirectivesYet = false;
                    }
                    if (ua.equals("*")) {
                        // The wildcard is keyed by the empty string and
                        // appended to userAgents only after all others.
                        ua = "";
                        catchall = ua;
                    } else {
                        userAgents.addLast(ua);
                    }
                    disallows.put(ua, current);
                } else if (DISALLOW_LINE.matcher(read).matches()) {
                    if (current == null) {
                        // Directive without a preceding User-agent line.
                        hasErrors = true;
                    } else {
                        current.add(
                            read.substring(DISALLOW_PREFIX.length()).trim());
                        hasDirectivesYet = true;
                    }
                } else if (OTHER_DIRECTIVE_LINE.matcher(read).matches()) {
                    if (current == null) {
                        hasErrors = true;
                    } else {
                        // Value is not recorded, but the directive still
                        // terminates the run of User-agent lines.
                        hasDirectivesYet = true;
                    }
                }
                // Any other line is silently ignored.
            }
            completed = true;
        } finally {
            if (completed) {
                reader.close();
            } else {
                // Already failing: release the reader without masking
                // the original exception.
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // the primary failure is the one worth reporting
                }
            }
        }

        if (catchall != null) {
            userAgents.addLast(catchall);
        }
        return hasErrors;
    }

    /**
     * Removes everything from the first '#' onward and trims surrounding
     * whitespace; returns an empty string for blank or comment-only lines.
     */
    private static String stripComment(String line) {
        int commentIndex = line.indexOf('#');
        if (commentIndex > -1) {
            line = line.substring(0, commentIndex);
        }
        return line.trim();
    }

    /**
     * Command-line entry point; currently does nothing.
     *
     * @param args Command-line arguments.
     */
    public static void main(String[] args) {
        // intentionally empty
    }
}