import java.net.URL; import java.net.Socket; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.PushbackInputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.FileOutputStream; import java.io.BufferedOutputStream; import java.util.Vector; import java.util.Enumeration; /** *

Netrand Project

*

Software Engineering - CS536

*

University of Wisconsin - Milwaukee

*

Authors:

* *
File: URLStatus.java
* Note: * This file was originally part of a Web Crawler program written by * Tim Macinta in 1997 that gathered links form the internet and formed * a web search database. The files containing the logic for crawling * across the internet were taken from this program and slightly modified * for the purpose of the NetRand project. *
* This class holds information about the content at a particular URL. * It can also be used to fetch and parse an URL. */ public class URLStatus { URL given_url; // The given URL URL actual_url = null; // The actual URL (differs in case of redirection) File temp_file; // temporary file containing the contents of "url" EnginePrefs eng_prefs; // engine preferences String mime_type = ""; // mime type static final int LOADED = 0; static final int NOT_LOADED = 1; static final int MOVED = 2; static final int DUPLICATE = 4; static final int MISSING = 8; static final int TIMED_OUT = 16; static final int IO_ERROR = 32; static final int UNSUPPORTED_MIMETYPE = 64; static final int MISC_ERROR = 128; int status = NOT_LOADED; // gives the status of this URL /** "url" is the location of the information and "temp_file" is the * temporary file that can be used to store the contents of this * url. */ public URLStatus(URL url, File temp_file, EnginePrefs eng_prefs) { this.given_url = url; this.actual_url = url; this.temp_file = temp_file; this.eng_prefs = eng_prefs; } /** Returns true if and only if this URL was loaded without an error. */ public boolean loaded() { return ((Integer.MAX_VALUE ^ LOADED) & status) == 0; } /** Returns a LinkExtractor that can handle this URL's mime type. * To add support for new mime types add a LinkExtractor that handles * those mime types here and add appropriate WordExtractors to the * getWordExtractor() method. Also, add the mime type to the list in * the mimeTypeUnderstood() method. */ public LinkExtractor getLinkExtractor() throws IOException { if (mime_type.equals("text/html")) { return new HTMLLinkExtractor(temp_file, actual_url); } else if (mime_type.equals("text/plain")) { return new NullLinkExtractor(); } return new NullLinkExtractor(); } /** Returns true if and only if this mime type can be processed. */ public boolean mimeTypeUnderstood(String mime_type) { mime_type = mime_type.toLowerCase(); // Add new mime types inside the conditional part of the 'if' statement. if (mime_type.equals("text/html") || mime_type.equals("text/plain")) { return true; } return false; } /** Returns the file that is used to cache the contents of this URL. */ public File getCacheFile() { return temp_file; } /** Downloads the content of the given URL and stores it in a temporary * cache file. */ public void readContent() { String proto = given_url.getProtocol().toLowerCase(); try { if (proto.equals("http")) { readHTTP(); } else { readGeneric(); } } catch (IOException e) { status |= IO_ERROR; } } /** Downloads a file using the HTTP protocol. It was necessary to * write a method to do this from scratch rather than using the default * method in Java because: *

*/ void readHTTP() throws IOException { int port = given_url.getPort(); if (port < 0) port = 80; Socket sock = new Socket(given_url.getHost(), port); PushbackInputStream pbin = new PushbackInputStream(sock.getInputStream()); DataInputStream in = new DataInputStream(pbin); DataOutputStream out = new DataOutputStream(new BufferedOutputStream(sock.getOutputStream())); OutputStream cache = new BufferedOutputStream(new FileOutputStream(temp_file)); out.writeBytes("GET "+given_url.getFile()+" HTTP/1.0\n"); out.writeBytes("User-Agent: "+ eng_prefs.user_agent + "\n"); out.writeBytes("From: "+ eng_prefs.email_address + "\n"); out.writeBytes("Host: "+given_url.getHost()+"\n"); out.writeBytes("\n"); out.flush(); String line = readLine(pbin); try { // try reading full response if (!line.toLowerCase().startsWith("http/")) { throw new Exception(); } int url_status = line.charAt(line.indexOf(' ')+1); switch(url_status) { case '2': // request successful String line2; try { while (true) { line2 = in.readLine().toLowerCase(); if (line2 == null) break; if (line2.startsWith("content-type:")) { if (line2.charAt(13) == ' ') { mime_type = line2.substring(14); } else { mime_type = line2.substring(13); } if (!mimeTypeUnderstood(mime_type)) { status |= UNSUPPORTED_MIMETYPE; in.close(); out.close(); sock.close(); cache.close(); return; } } else if (line2.equals("")) { break; } } pipe(in, cache); status = LOADED; } catch (StringIndexOutOfBoundsException e) {} break; case '3': // page has moved status = MOVED; line = in.readLine(); while (!line.toLowerCase().startsWith("location:")) line = in.readLine(); if (line.charAt(9) == ' ') { actual_url = new URL(line.substring(10).trim()); } else { actual_url = new URL(line.substring(9).trim()); } break; case '4': status |= MISSING; break; default: status |= MISC_ERROR; break; } } catch (Exception e) { // full response failed, now try simple response if (line != null) new DataOutputStream(cache).writeBytes(line); pipe(in, cache); } in.close(); out.close(); sock.close(); cache.close(); } /** A replacement for the java.io.DataInputStream which doesn't return * the line ending characters like it should. */ String readLine(PushbackInputStream in) throws IOException { StringBuffer sb = new StringBuffer(); int c; while (true) { c = in.read(); switch (c) { case '\n': sb.append('\n'); return new String(sb); case '\r': sb.append('\r'); c = in.read(); if (c == '\n') { sb.append('\n'); return new String(sb); } else { in.unread((char) c); return new String(sb); } case -1: if (sb.length() < 1) return null; return new String(sb); default: sb.append((char) c); } } } /** This method provides a fallback to the default Java implementation * for protocols which have not been re-implemented. */ void readGeneric() throws IOException { // guess at mime type String url = given_url.toString().toLowerCase(); if (url.endsWith(".html") || url.endsWith(".htm")) { mime_type = "text/html"; } else if (url.endsWith(".txt")) { mime_type = "text/plain"; } // fetch URL InputStream in = given_url.openStream(); OutputStream out = new FileOutputStream(temp_file); pipe(in, out); out.close(); status = LOADED; } /** Gets rid of the temporary file. * @exception Throwable is thrown */ public void finalize() throws Throwable { temp_file.delete(); super.finalize(); } /** Pipes "in" to "out" until "in" is exhausted then closes "in". */ void pipe(InputStream in, OutputStream out) throws IOException { byte[] b = new byte[512]; int x = in.read(b, 0, b.length); while (x > 0) { out.write(b, 0, x); x = in.read(b, 0, b.length); } in.close(); } /** Returns true if and only if this URL causes a redirection. */ public boolean moved() { return (status & MOVED) != 0; } /** Returns the length of the content, or 0 if it's unknown. */ public long getContentLength() { return temp_file.length(); } }