/* ServerCache
 *
 * Created on Nov 19, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.datamodel;

import java.util.Hashtable;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.collections.Closure;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.settings.SettingsHandler;
/**
 * Server and Host cache.
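 * <p>A minimal usage sketch (illustrative only; assumes an already
 * configured {@link CrawlController} named <code>controller</code>,
 * and elides exception handling):
 * <pre>
 * ServerCache cache = new ServerCache(controller);
 * CrawlServer server = cache.getServerFor("www.example.com:8080");
 * CrawlHost host = cache.getHostFor("www.example.com");
 * // ... crawl ...
 * cache.cleanup();
 * </pre>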
 * @author stack
 * @version $Date: 2007-08-28 05:15:25 +0000 (Tue, 28 Aug 2007) $, $Revision: 5439 $
 */
public class ServerCache {
    private static Logger logger =
        Logger.getLogger(ServerCache.class.getName());
    
    protected SettingsHandler settingsHandler = null;
    
    /**
     * hostname[:port] -> CrawlServer.
     * Set during initialization.
     */
    protected Map<String,CrawlServer> servers = null;
    
    /**
     * hostname -> CrawlHost.
     * Set during initialization.
     */
    protected Map<String,CrawlHost> hosts = null;
    
    /**
     * Constructor.
     * Protected to shut down outside access to the default constructor.
     */
    protected ServerCache() {
        super();
    }
    
    /**
     * Creates a ServerCache that is all memory-based, using
     * Hashtables.  Used for unit testing only
     * (use {@link #ServerCache(CrawlController)} when crawling).
     * @param sh SettingsHandler to use.
     * @throws Exception
     */
    public ServerCache(final SettingsHandler sh)
    throws Exception {
        this.settingsHandler = sh;
        this.servers = new Hashtable<String,CrawlServer>();
        this.hosts = new Hashtable<String,CrawlHost>();
    }
    
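    /**
     * Creates a ServerCache backed by the passed CrawlController's
     * "big maps", which may be disk-backed (e.g. a bdb bigmap; see
     * the note in {@link #cleanup()}).  Use this constructor when
     * crawling.
     * @param c CrawlController to get the SettingsHandler and backing
     * maps from.
     * @throws Exception
     */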
    public ServerCache(final CrawlController c)
    throws Exception {
        this.settingsHandler = c.getSettingsHandler();
        this.servers = c.getBigMap("servers", String.class, CrawlServer.class);
        this.hosts = c.getBigMap("hosts", String.class, CrawlHost.class);
    }
    
    /**
     * Get the {@link CrawlServer} associated with <code>serverKey</code>.
     * @param serverKey Server name we're to return a server for.
     * @return CrawlServer instance that matches the passed server name.
     */
    public synchronized CrawlServer getServerFor(String serverKey) {
        CrawlServer cserver = this.servers.get(serverKey);
        return (cserver != null)? cserver: createServerFor(serverKey);
    }
    
    protected CrawlServer createServerFor(String s) {
        CrawlServer cserver = this.servers.get(s);
        if (cserver != null) {
            return cserver;
        }
        // Ensure the key is a private String instance (a copy), not an
        // object the caller may hold a reference to.
        String skey = new String(s);
        cserver = new CrawlServer(skey);
        cserver.setSettingsHandler(settingsHandler);
        servers.put(skey, cserver);
        if (logger.isLoggable(Level.FINER)) {
            logger.finer("Created server " + s);
        }
        return cserver;
    }

    /**
     * Get the {@link CrawlServer} associated with <code>cauri</code>.
     * @param cauri CandidateURI we're to get a server from.
     * @return CrawlServer instance that matches the passed CandidateURI,
     * or null if none could be gotten.
     */
    public CrawlServer getServerFor(CandidateURI cauri) {
        CrawlServer cs = null;
        try {
            String key = CrawlServer.getServerKey(cauri);
            // TODOSOMEDAY: make this robust against those rare cases
            // where authority is not a hostname.
            if (key != null) {
                cs = getServerFor(key);
            }
        } catch (URIException e) {
            logger.log(Level.SEVERE, e.getMessage() + ": " + cauri, e);
        } catch (NullPointerException npe) {
            logger.log(Level.SEVERE, npe.getMessage() + ": " + cauri, npe);
        }
        return cs;
    }
    
    /**
     * Get the {@link CrawlHost} associated with <code>hostname</code>.
     * @param hostname Host name we're to return a Host for.
     * @return CrawlHost instance that matches the passed host name,
     * or null if the passed name is null or empty.
     */
    public synchronized CrawlHost getHostFor(String hostname) {
        if (hostname == null || hostname.length() == 0) {
            return null;
        }
        CrawlHost host = this.hosts.get(hostname);
        return (host != null)? host: createHostFor(hostname);
    }
    
    protected CrawlHost createHostFor(String hostname) {
        if (hostname == null || hostname.length() == 0) {
            return null;
        }
        CrawlHost host = this.hosts.get(hostname);
        if (host != null) {
            return host;
        }
        // As in createServerFor, ensure the key is a private String copy.
        String hkey = new String(hostname);
        host = new CrawlHost(hkey);
        this.hosts.put(hkey, host);
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("Created host " + hostname);
        }
        return host;
    }
    
    /**
     * Get the {@link CrawlHost} associated with <code>cauri</code>.
     * @param cauri CandidateURI we're to return a Host for.
     * @return CrawlHost instance that matches the passed CandidateURI's
     * host name, or null on URI trouble.
     */
    public CrawlHost getHostFor(CandidateURI cauri) {
        CrawlHost h = null;
        try {
            h = getHostFor(cauri.getUURI().getReferencedHost());
        } catch (URIException e) {
            logger.log(Level.SEVERE, e.getMessage() + ": " + cauri, e);
        }
        return h;
    }

    /**
     * @param serverKey Key to use doing lookup.
     * @return True if a server instance exists.
     */
    public boolean containsServer(String serverKey) {
        return servers.get(serverKey) != null;
    }

    /**
     * @param hostKey Key to use doing lookup.
     * @return True if a host instance exists.
     */
    public boolean containsHost(String hostKey) {
        return hosts.get(hostKey) != null;
    }

    /**
     * Called when shutting down the cache so we can clean up.
     */
    public void cleanup() {
        if (this.hosts != null) {
            // If we're using a bdb bigmap, the call to clear will
            // close down the bdb database.
            this.hosts.clear();
            this.hosts = null;
        }
        if (this.servers != null) {
            this.servers.clear();
            this.servers = null;
        }
    }

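    /**
     * Run the passed Closure against every CrawlHost in this cache.
     * <p>A minimal usage sketch (illustrative only; prints each cached
     * host's name via {@link CrawlHost#getHostName()}):
     * <pre>
     * cache.forAllHostsDo(new Closure() {
     *     public void execute(Object host) {
     *         System.out.println(((CrawlHost)host).getHostName());
     *     }
     * });
     * </pre>
     * @param c Closure to run against each CrawlHost.
     */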
    public void forAllHostsDo(Closure c) {
        for (CrawlHost host : hosts.values()) {
            c.execute(host);
        }
    }
}