import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Vector;

/**
 * Netrand Project
 *
 * Software Engineering - CS536
 *
 * University of Wisconsin - Milwaukee
 *
 * Authors:
 *
 * File: EnginePrefs.java
 *
 * Note:
 * This file was originally part of a Web Crawler program written by
 * Tim Macinta in 1997 that gathered links from the internet and formed
 * a web search database.  The files containing the logic for crawling
 * across the internet were taken from this program and slightly modified
 * for the purpose of the NetRand project.
 *
 * Encapsulates the preferences for the crawler and the search
 * engine.
 */
public class EnginePrefs {

    /** Character encoding that maps every byte to the char with the same value. */
    private static final String TEXT_ENCODING = "ISO-8859-1";

    /** Lower-case file name endings that {@link #URLNotIndexable(URL)} rejects. */
    private static final String[] NON_INDEXABLE_SUFFIXES = {
        ".gif", ".tif", ".map", ".jpg", ".ppt", ".doc", ".pdf", ".xls", ".rtf"
    };

    /** The time to pause between URL fetches (in seconds). */
    public int pause_time = 5;

    public int maxCacheFiles = 10;

    File main_dir = new File("searchdb");            // directory containing main index
                                                     // and custom html files
    File rules = new File(main_dir, "rules.txt");    // inclusion/exclusion rules
    File url_list = new File(main_dir, "urls.txt");  // list of starting URLs
    File working_dir = new File("searchdb");         // temporary working directory
    Vector excluded = new Vector(3, 10);             // excluded URLs
    Vector included = new Vector(3, 10);             // included URLs
    Hashtable hosts = new Hashtable(3, 6);           // hosts where we've read robots.txt
    String user_agent = "BDDBot";                    // name used when retrieving URLs
    String email_address = "nobody@nowhere.edu";     // administrator's email address
    boolean filter_cgi = true;                       // filter out cgi urls?

    public static int port = 8001;                   // default web server port

    /**
     * Creates the main and working directories if they do not exist yet and
     * loads the inclusion/exclusion rules.  A rules file that cannot be read
     * is reported on standard error and otherwise ignored.
     */
    public EnginePrefs() {
        if (!main_dir.exists()) {
            main_dir.mkdir();
        }
        if (!working_dir.exists()) {
            working_dir.mkdir();
        }
        try {
            readRulesFile();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns true if "url" is allowed to be indexed and false otherwise.
     *
     * The checks are applied in this order: the CGI filter (when
     * "filter_cgi" is set), the exclusion rules, the "file" protocol
     * shortcut, and finally the inclusion rules.  The first time an "http"
     * URL matches an inclusion rule for a host/port pair, that host's
     * robots.txt is fetched and the exclusion rules are checked again.
     * A URL that matches no rule at all is allowed.
     *
     * NOTE(review): robots.txt is only consulted for hosts that match an
     * inclusion rule; URLs allowed by the final fall-through are never
     * checked against robots.txt.
     *
     * @param url the URL to check
     * @return true if the URL may be indexed
     */
    public boolean URLAllowed(URL url) {
        String protocol = url.getProtocol();
        String host = url.getHost();
        int port = url.getPort();
        if (port < 0 && protocol.equals("http")) {
            port = 80;
        }
        String file = url.getFile();

        // filter out cgi scripts
        if (filter_cgi) {
            if (file.indexOf('?') > -1) {
                return false;
            }
            if (file.startsWith("/cgi-bin/")) {
                return false;
            }
        }

        // exclusion rules always win
        if (matchesAnyRule(excluded, protocol, host, port, file)) {
            return false;
        }

        // include all files that aren't excluded
        if (protocol.equals("file")) {
            return true;
        }

        // an included http host gets its robots.txt read once; the directives
        // found there land in "excluded", so that list is consulted again
        if (protocol.equals("http")
                && matchesAnyRule(included, protocol, host, port, file)) {
            String hostKey = host + ":" + port;
            if (hosts.get(hostKey) == null) {
                readRobotsDotText(host, port);
                hosts.put(hostKey, Boolean.TRUE);
                return !matchesAnyRule(excluded, protocol, host, port, file);
            }
        }
        return true;
    }

    /**
     * Returns true if any URL in "rules" has the same protocol, host and
     * port as the given components and a file part that is a prefix of
     * "file".  A rule without an explicit port matches port 80.
     */
    private static boolean matchesAnyRule(Vector rules, String protocol,
                                          String host, int port, String file) {
        Enumeration en = rules.elements();
        while (en.hasMoreElements()) {
            URL rule = (URL) en.nextElement();
            int rulePort = rule.getPort();
            boolean samePort = port == rulePort || (port == 80 && rulePort == -1);
            if (samePort
                    && protocol.equals(rule.getProtocol())
                    && host.equals(rule.getHost())
                    && file.startsWith(rule.getFile())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Pauses for the amount of time that has been specified for pausing
     * between URL fetches.
     *
     * The full pause is honoured even if the thread is interrupted while
     * sleeping; in that case the thread's interrupt status is set again
     * before this method returns so the caller can still observe it.
     * A "pause_time" of zero or less returns immediately.
     */
    public void pauseBetweenURLs() {
        long remaining = pause_time * 1000L;
        if (remaining <= 0) {
            return;
        }
        long deadline = System.currentTimeMillis() + remaining;
        boolean interrupted = false;
        while (remaining > 0) {
            try {
                Thread.sleep(remaining);
            } catch (InterruptedException e) {
                // remember the interrupt and keep sleeping for the remainder
                interrupted = true;
            }
            remaining = deadline - System.currentTimeMillis();
        }
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }

    /** Returns the directory containing the main index. */
    public File getMainDir() {
        return main_dir;
    }

    /**
     * Returns the working directory for use by a crawler.  If more than
     * one crawler is running at the same time they should be given different
     * working directories.
     */
    public File getWorkingDir() {
        return working_dir;
    }

    /** Returns the file that holds the list of starting URLs. */
    public File getStartingFile() {
        return url_list;
    }

    /**
     * The rules file contains rules which determine what URLs are allowed
     * and what URLs should be excluded.  A line that is in the form:
     * <pre>
     *  include http://gsd.mit.edu/
     * </pre>
     * will cause all URLs that start with "http://gsd.mit.edu/" to be
     * included.  Similarly, to exclude URLs, use the keyword "exclude"
     * instead of "include".  Blank lines and lines starting with "#" are
     * ignored.
     *
     * When an URL is checked against the inclusion/exclusion rules the
     * exclusion rules are checked first and if the URL matches an
     * exclusion rule it is not included.  In this version of the class an
     * URL that is not covered by either rule is still allowed by
     * {@link #URLAllowed(URL)}; matching an inclusion rule only triggers
     * the robots.txt check for "http" hosts.
     * NOTE(review): the original crawler documentation said uncovered
     * non-"file://" URLs are rejected - presumably relaxed for NetRand;
     * confirm before relying on either behaviour.
     */
    public File getRulesFile() {
        return rules;
    }

    /**
     * Causes the inclusion/exclusion rules to be read.  This method should
     * be called if the rules file is changed.
     *
     * All previously loaded rules - including exclusions learned from
     * robots.txt files - and the record of visited hosts are discarded
     * first.  Lines holding a malformed URL are reported on standard error
     * and skipped.
     *
     * @throws IOException if the rules file cannot be opened or read
     */
    public void readRulesFile() throws IOException {
        excluded.removeAllElements();
        included.removeAllElements();
        hosts.clear();

        InputStream stream = new FileInputStream(rules);
        try {
            BufferedReader in =
                new BufferedReader(new InputStreamReader(stream, TEXT_ENCODING));
            String line;
            while ((line = in.readLine()) != null) {
                line = line.trim();
                try {
                    if (line.startsWith("include ")) {
                        included.addElement(new URL(line.substring(8)));
                    } else if (line.startsWith("exclude ")) {
                        excluded.addElement(new URL(line.substring(8)));
                    }
                } catch (MalformedURLException e) {
                    e.printStackTrace();
                }
            }
        } finally {
            // release the file handle even when reading fails half way
            stream.close();
        }
    }

    /**
     * Reads the "robots.txt" file on the given host and uses the results
     * to determine what files on "host" are crawlable.
     *
     * Every "Disallow" path found in a section whose "User-agent" value is
     * a prefix of this crawler's user agent (a trailing "*" is ignored, so
     * "*" alone matches everyone) is added to the exclusion rules.  An
     * empty "Disallow:" line means "nothing is disallowed" and adds no
     * rule.  This is a best-effort operation: if the file cannot be
     * fetched or read, the rules collected so far are kept and the problem
     * is otherwise ignored.
     *
     * @param host the host to fetch robots.txt from
     * @param port the port to contact; a negative value means port 80
     */
    public void readRobotsDotText(String host, int port) {
        if (port < 0) {
            port = 80;
        }
        InputStream stream = null;
        try {
            URL robotsUrl = new URL("http", host, port, "/robots.txt");
            stream = robotsUrl.openStream();
            BufferedReader in =
                new BufferedReader(new InputStreamReader(stream, TEXT_ENCODING));
            String agent = user_agent.toLowerCase(Locale.ENGLISH);
            boolean relevant = false;
            String line;
            while ((line = in.readLine()) != null) {
                line = line.trim();
                String lowerCase = line.toLowerCase(Locale.ENGLISH);
                if (lowerCase.startsWith("user-agent:")) {
                    // determine if the following directives apply to us
                    String name = firstToken(lowerCase, 11);
                    if (name.length() == 0) {
                        relevant = false;   // no agent named: applies to nobody
                    } else {
                        if (name.endsWith("*")) {
                            name = name.substring(0, name.length() - 1);
                        }
                        relevant = agent.startsWith(name);
                    }
                } else if (relevant && lowerCase.startsWith("disallow:")) {
                    // assimilate directive if applicable; an empty path would
                    // be a prefix of every file, so it must not become a rule
                    String path = firstToken(line, 9);
                    if (path.length() > 0) {
                        excluded.addElement(new URL("http", host, port, path));
                    }
                }
            }
        } catch (IOException e) {
            // best effort: an unreachable or unreadable robots.txt adds no rules
        } finally {
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
        }
    }

    /**
     * Returns the first run of non-whitespace characters in "text" at or
     * after index "from", or the empty string if there is none.
     */
    private static String firstToken(String text, int from) {
        int length = text.length();
        int begin = from;
        while (begin < length && Character.isWhitespace(text.charAt(begin))) {
            begin++;
        }
        int end = begin;
        while (end < length && !Character.isWhitespace(text.charAt(end))) {
            end++;
        }
        return text.substring(begin, end);
    }

    /**
     * Returns true if this URL represents a file type that is not indexable.
     * The decision is based only on the (case-insensitive) ending of the
     * URL's file part.
     */
    public boolean URLNotIndexable(URL url) {
        String f = url.getFile().toLowerCase(Locale.ENGLISH);
        for (int i = 0; i < NON_INDEXABLE_SUFFIXES.length; i++) {
            if (f.endsWith(NON_INDEXABLE_SUFFIXES[i])) {
                return true;
            }
        }
        return false;
    }
}