import java.util.Vector; import java.util.StringTokenizer; import java.net.URL; import java.net.MalformedURLException; import java.io.InputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; /** *

Netrand Project

*

Software Engineering - CS536

*

University of Wisconsin - Milwaukee

*

Authors:

* *
File: HTMLLinkExtractor.java
* Note: * This file was originally part of a Web Crawler program written by * Tim Macinta in 1997 that gathered links form the internet and formed * a web search database. The files containing the logic for crawling * across the internet were taken from this program and slightly modified * for the purpose of the NetRand project. *
* This LinkExtractor can extract URLs from HTML files. */ public class HTMLLinkExtractor implements LinkExtractor { Vector urls = new Vector(6, 9); // list of URLs int next_url = 0; // next URL to return int url_count = 0; // number of URLs URL base = null; // base URL /** Creates a new HTMLLinkExtractor that will enumerate all the * URLs in the give "cache_file". */ public HTMLLinkExtractor(File cache_file, URL base_url) throws IOException { this.base = base_url; InputStream in = new FileInputStream(cache_file); int state = 0; StringBuffer sb = new StringBuffer(); int i = in.read(); while (i >= 0) { switch (state) { case 0: if (i == '<') state = '<'; break; case '<': if (i == '>') { state = 0; analyze(sb.toString()); sb.setLength(0); } else { sb.append((char) i); } } i = in.read(); } if (sb.length() > 0) analyze(sb.toString()); in.close(); } /** Analyzes "param", which should be the contents between a '<' and a '>', * and adds any URLs that are found to the list of URLs. */ public void analyze(String param) { StringTokenizer st = new StringTokenizer(param); if (st.countTokens() < 2) return; String first_word = st.nextToken().toLowerCase(); if (first_word.equals("a")) { analyzeAnchor(st.nextToken("")); } else if (first_word.equals("frame")) { analyzeFrame(st.nextToken("")); } else if (first_word.equals("base")) { extractBase(st.nextToken("")); } } /** Analyzes the tag. */ void analyzeAnchor(String anchor) { String href = extract(anchor, "href"); if (href == null) return; try { addURL(new URL(base, href)); } catch (MalformedURLException e) { anchor = anchor.toLowerCase(); // java doesn't understand mailto and will throw an exception if (!href.startsWith("mailto:")) { e.printStackTrace(); } } } /** Analyzes the tag. */ void analyzeFrame(String frame) { String src = extract(frame, "src"); if (src == null) return; try { addURL(new URL(base, src)); } catch (MalformedURLException e) { e.printStackTrace(); } } /** Extracts the base URL from the tag. */ void extractBase(String b) { String b2 = extract(b, "href"); if (b2 != null) { try { base = new URL(base, b2); } catch (MalformedURLException e) { e.printStackTrace(); } } } /** Adds "url" to the list of URLs. */ public void addURL(URL url) { urls.addElement(url); url_count++; } public boolean hasMoreElements() { return url_count != next_url; } public Object nextElement() { Object ob = urls.elementAt(next_url); next_url++; return ob; } /** Resets this enumeration. */ public void reset() { next_url = 0; } /** Returns the value in "line" associated with "key", or null if "key" * is not found. For instance, if line were "a href="blah blah blah" * and "key" were "href" this method would return "blah blah blah". *

* Keys are case insensitive. */ String extract(String line, String key) { try { key = key.toLowerCase(); String lower_case = line.toLowerCase(); int i = lower_case.indexOf(key); if (i < 0) return null; i += key.length(); if (line.charAt(i) != '=') return null; i++; int i2; if (line.charAt(i) == '"') { i++; i2 = line.indexOf('"', i); if (i2 < 0) { return line.substring(i); } else { return line.substring(i, i2); } } else { int targ = line.length(); for (i2 = i; i < targ; i++) { if (Character.isSpace(line.charAt(i))) break; } return line.substring(i, i2); } } catch (StringIndexOutOfBoundsException e) {} return null; } }