View Javadoc

1   /* RobotstxtTest
2    *
3    * $Id: RobotstxtTest.java 4668 2006-09-26 21:49:01Z paul_jack $
4    *
5    * Created Sep 1, 2005
6    *
7    * Copyright (C) 2005 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.datamodel;
26  
27  import java.io.BufferedReader;
28  import java.io.IOException;
29  import java.io.StringReader;
30  import java.util.HashMap;
31  import java.util.LinkedList;
32  import java.util.List;
33  
34  import junit.framework.TestCase;
35  
36  public class RobotstxtTest extends TestCase {
37      public void testParseRobots() throws IOException {
38          LinkedList<String> userAgents = new LinkedList<String>();
39          HashMap<String,List<String>> disallows
40           = new HashMap<String,List<String>>();
41          BufferedReader reader = new BufferedReader(new StringReader("BLAH"));
42          assertFalse(Robotstxt.parse(reader, userAgents, disallows));
43          assertTrue(disallows.size() == 0);
44          // Parse archive robots.txt with heritrix agent.
45          String agent = "archive.org_bot";
46          reader = new BufferedReader(
47              new StringReader("User-agent: " + agent + "\n" +
48              "Disallow: /cgi-bin/\n" +
49              "Disallow: /details/software\n"));
50          assertFalse(Robotstxt.parse(reader, userAgents, disallows));
51          assertTrue(disallows.size() == 1);
52          assertTrue(userAgents.size() == 1);
53          assertEquals(userAgents.get(0), agent);
54          // Parse archive robots.txt with star agent.
55          agent = "*";
56          reader = new BufferedReader(
57              new StringReader("User-agent: " + agent + "\n" +
58              "Disallow: /cgi-bin/\n" +
59              "Disallow: /details/software\n"));
60          disallows = new HashMap<String,List<String>>();
61          userAgents = new LinkedList<String>();
62          assertFalse(Robotstxt.parse(reader, userAgents, disallows));
63          assertTrue(disallows.size() == 1);
64          assertTrue(userAgents.size() == 1);
65          assertEquals(userAgents.get(0), "");
66      }
67  }