import java.net.URL;
import java.net.Socket;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PushbackInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileOutputStream;
import java.io.BufferedOutputStream;
import java.util.Vector;
import java.util.Enumeration;
/**
*
* Netrand Project
* Software Engineering - CS536
* University of Wisconsin - Milwaukee
* Authors:
*
* - Spring 1998 - Francis William Kasper
*
*
* File: URLStatus.java
* Note:
* This file was originally part of a Web Crawler program written by
* Tim Macinta in 1997 that gathered links from the internet and formed
* a web search database. The files containing the logic for crawling
* across the internet were taken from this program and slightly modified
* for the purpose of the NetRand project.
*
* This class holds information about the content at a particular URL.
* It can also be used to fetch and parse a URL.
*/
public class URLStatus {

    URL given_url;           // The URL this object was created for
    URL actual_url = null;   // The actual URL (differs from given_url after a redirect)
    File temp_file;          // Temporary file that receives the downloaded content
    EnginePrefs eng_prefs;   // Engine preferences (user agent and e-mail address are read from it)
    String mime_type = "";   // Bare, lower-case mime type ("" while unknown)

    // Status values. LOADED is zero; every other value is a distinct bit
    // that is OR-ed into "status" as problems are detected.
    static final int LOADED = 0;
    static final int NOT_LOADED = 1;
    static final int MOVED = 2;
    static final int DUPLICATE = 4;
    static final int MISSING = 8;
    static final int TIMED_OUT = 16;
    static final int IO_ERROR = 32;
    static final int UNSUPPORTED_MIMETYPE = 64;
    static final int MISC_ERROR = 128;

    int status = NOT_LOADED; // Current status of this URL

    /**
     * Creates a status object for a URL that has not been fetched yet.
     *
     * @param url       location of the content
     * @param temp_file temporary file that may be used to store the content
     *                  of the URL
     * @param eng_prefs engine preferences used when building HTTP requests
     */
    public URLStatus(URL url, File temp_file, EnginePrefs eng_prefs) {
        this.given_url = url;
        this.actual_url = url;
        this.temp_file = temp_file;
        this.eng_prefs = eng_prefs;
    }

    /** Returns true if and only if this URL was loaded without an error. */
    public boolean loaded() {
        // LOADED is 0, so this is true exactly when no status bit is set.
        return ((Integer.MAX_VALUE ^ LOADED) & status) == 0;
    }

    /**
     * Returns a LinkExtractor that can handle this URL's mime type.
     * To add support for new mime types add a LinkExtractor that handles
     * those mime types here and add appropriate WordExtractors to the
     * getWordExtractor() method. Also, add the mime type to the list in
     * the mimeTypeUnderstood() method.
     *
     * @return an extractor for the cached content; a NullLinkExtractor for
     *         every type other than text/html
     * @exception IOException declared for extractors that read the cache file
     */
    public LinkExtractor getLinkExtractor() throws IOException {
        if ("text/html".equals(mime_type)) {
            return new HTMLLinkExtractor(temp_file, actual_url);
        }
        // text/plain and unknown types get the null extractor.
        return new NullLinkExtractor();
    }

    /**
     * Returns true if and only if this mime type can be processed.
     * The comparison ignores case, surrounding whitespace and any
     * parameters, so "Text/HTML; charset=utf-8" is understood.
     *
     * @param mime_type value of a Content-Type header (may be null)
     */
    public boolean mimeTypeUnderstood(String mime_type) {
        String bare = normalizeMimeType(mime_type);
        // Add new mime types to this expression.
        return bare.equals("text/html") || bare.equals("text/plain");
    }

    /** Returns the file that is used to cache the contents of this URL. */
    public File getCacheFile() {
        return temp_file;
    }

    /**
     * Downloads the content of the given URL and stores it in the temporary
     * cache file. An IOException raised while downloading is not propagated;
     * it is recorded by setting the IO_ERROR bit in the status.
     */
    public void readContent() {
        try {
            if ("http".equalsIgnoreCase(given_url.getProtocol())) {
                readHTTP();
            } else {
                readGeneric();
            }
        } catch (IOException e) {
            status |= IO_ERROR;
        }
    }

    /**
     * Downloads a file using the HTTP protocol. It was necessary to
     * write a method to do this from scratch rather than using the default
     * method in Java because:
     * <ul>
     * <li>There is no means for specifying the user agent using the
     *     default method.</li>
     * <li>There is a bug in the Java 1.0 implementation that makes it
     *     incompatible with HTTP version 1.1.</li>
     * <li>Redirects are automatically followed (at least in Java 1.0)
     *     without providing a way to determine whether a redirect has
     *     occurred.</li>
     * </ul>
     * On return the status reflects the outcome: LOADED for a 2xx response
     * whose body was cached, MOVED for a 3xx response carrying a Location
     * header (actual_url is updated), MISSING for 4xx, UNSUPPORTED_MIMETYPE
     * when the content type cannot be processed, MISC_ERROR otherwise.
     * The socket and the cache file are closed on every path.
     *
     * @exception IOException if the connection or the cache file fails
     */
    void readHTTP() throws IOException {
        int port = given_url.getPort();
        boolean default_port = (port < 0 || port == 80);
        if (port < 0) port = 80;

        // "http://host" has an empty file part; the request line needs "/".
        String path = given_url.getFile();
        if (path == null || path.length() == 0) path = "/";

        // A non-default port has to be repeated in the Host header.
        String host_header = given_url.getHost();
        if (!default_port) host_header = host_header + ":" + port;

        Socket sock = new Socket(given_url.getHost(), port);
        OutputStream cache = null;
        try {
            PushbackInputStream in = new PushbackInputStream(sock.getInputStream());
            DataOutputStream out =
                new DataOutputStream(new BufferedOutputStream(sock.getOutputStream()));
            cache = new BufferedOutputStream(new FileOutputStream(temp_file));

            // HTTP header lines are terminated by CRLF.
            out.writeBytes("GET " + path + " HTTP/1.0\r\n");
            out.writeBytes("User-Agent: " + eng_prefs.user_agent + "\r\n");
            out.writeBytes("From: " + eng_prefs.email_address + "\r\n");
            out.writeBytes("Host: " + host_header + "\r\n");
            out.writeBytes("\r\n");
            out.flush();

            String status_line = readLine(in);
            if (status_line == null || !startsWithIgnoreCase(status_line, "http/")) {
                // No status line: treat it as a "simple response" where the
                // body starts immediately, so the line already read is content.
                // NOTE(review): status is not set to LOADED on this path;
                // confirm whether simple responses should count as loaded.
                if (status_line != null) {
                    new DataOutputStream(cache).writeBytes(status_line);
                }
                pipe(in, cache);
                return;
            }

            // The first character after the first space is the status class.
            int space = status_line.indexOf(' ');
            char status_class = '\0';
            if (space >= 0 && space + 1 < status_line.length()) {
                status_class = status_line.charAt(space + 1);
            }

            switch (status_class) {
                case '2': // request successful
                    if (readSuccessHeaders(in)) {
                        pipe(in, cache);
                        status = LOADED;
                    }
                    break;
                case '3': // page has moved
                    URL target = readRedirectTarget(in);
                    if (target != null) {
                        actual_url = target;
                        status = MOVED;
                    } else {
                        // A redirect without a usable Location header.
                        status |= MISC_ERROR;
                    }
                    break;
                case '4':
                    status |= MISSING;
                    break;
                default:
                    status |= MISC_ERROR;
                    break;
            }
        } finally {
            // Closing the cache flushes buffered content; closing the socket
            // also closes both of its streams.
            try {
                if (cache != null) cache.close();
            } finally {
                sock.close();
            }
        }
    }

    /**
     * Consumes the headers of a successful response up to the blank line
     * (or end of stream) and records the content type in mime_type.
     *
     * @return false if a Content-Type header names a type that cannot be
     *         processed (UNSUPPORTED_MIMETYPE is then set in the status),
     *         true otherwise
     */
    private boolean readSuccessHeaders(PushbackInputStream in) throws IOException {
        String header;
        while ((header = readHeaderLine(in)) != null && header.length() > 0) {
            if (startsWithIgnoreCase(header, "content-type:")) {
                mime_type = normalizeMimeType(header.substring(13));
                if (!mimeTypeUnderstood(mime_type)) {
                    status |= UNSUPPORTED_MIMETYPE;
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Scans the headers of a redirect response for a Location header.
     *
     * @return the redirect target resolved against given_url, or null if the
     *         headers end without a non-empty Location value
     * @exception IOException if reading fails or the location is malformed
     */
    private URL readRedirectTarget(PushbackInputStream in) throws IOException {
        String header;
        while ((header = readHeaderLine(in)) != null && header.length() > 0) {
            if (startsWithIgnoreCase(header, "location:")) {
                String location = header.substring(9).trim();
                if (location.length() == 0) return null;
                // Resolving against the request URL also accepts relative locations.
                return new URL(given_url, location);
            }
        }
        return null;
    }

    /**
     * Reads one line with readLine() and strips the trailing line ending.
     *
     * @return the line without CR/LF, or null at end of stream
     */
    private String readHeaderLine(PushbackInputStream in) throws IOException {
        String raw = readLine(in);
        if (raw == null) return null;
        int end = raw.length();
        while (end > 0 && (raw.charAt(end - 1) == '\n' || raw.charAt(end - 1) == '\r')) {
            end--;
        }
        return raw.substring(0, end);
    }

    /** Case-insensitive, locale-independent version of String.startsWith. */
    private static boolean startsWithIgnoreCase(String s, String prefix) {
        return s.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /**
     * Reduces a Content-Type value to its bare, lower-case media type by
     * dropping any ";parameter" suffix and surrounding whitespace.
     *
     * @return the bare type, or "" for null input
     */
    private static String normalizeMimeType(String raw) {
        if (raw == null) return "";
        int semi = raw.indexOf(';');
        String bare = (semi >= 0) ? raw.substring(0, semi) : raw;
        return bare.trim().toLowerCase();
    }

    /**
     * A replacement for java.io.DataInputStream.readLine() that, unlike
     * that method, keeps the line ending characters in the returned string.
     * A line ends at "\n", at "\r\n", at a lone "\r", or at end of stream.
     * Each byte read becomes one character of the result.
     *
     * @param in stream to read from; a byte read after a lone "\r" is
     *           pushed back onto it
     * @return the line including its terminator, or null if the stream was
     *         already at its end
     */
    String readLine(PushbackInputStream in) throws IOException {
        StringBuffer sb = new StringBuffer();
        while (true) {
            int c = in.read();
            switch (c) {
                case '\n':
                    sb.append('\n');
                    return sb.toString();
                case '\r':
                    sb.append('\r');
                    c = in.read();
                    if (c == '\n') {
                        sb.append('\n');
                    } else if (c != -1) {
                        // Not part of this line; end of stream must not be
                        // pushed back, it would reappear as a 0xFF byte.
                        in.unread(c);
                    }
                    return sb.toString();
                case -1:
                    if (sb.length() < 1) return null;
                    return sb.toString();
                default:
                    sb.append((char) c);
            }
        }
    }

    /**
     * This method provides a fallback to the default Java implementation
     * for protocols which have not been re-implemented. The mime type is
     * guessed from the URL's suffix and left unchanged for other suffixes.
     * Both streams are closed even if the copy fails.
     *
     * @exception IOException if the URL cannot be opened or copied
     */
    void readGeneric() throws IOException {
        // guess at mime type
        String url = given_url.toString().toLowerCase();
        if (url.endsWith(".html") || url.endsWith(".htm")) {
            mime_type = "text/html";
        } else if (url.endsWith(".txt")) {
            mime_type = "text/plain";
        }

        // fetch URL
        InputStream in = given_url.openStream();
        try {
            OutputStream out = new FileOutputStream(temp_file);
            try {
                pipe(in, out);
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }
        status = LOADED;
    }

    /**
     * Gets rid of the temporary file.
     *
     * @exception Throwable if the superclass finalizer throws
     */
    public void finalize() throws Throwable {
        try {
            if (temp_file != null) temp_file.delete();
        } finally {
            super.finalize();
        }
    }

    /**
     * Pipes "in" to "out" until "in" is exhausted, then closes "in".
     * "in" is closed even when the copy fails; "out" is left open.
     */
    void pipe(InputStream in, OutputStream out) throws IOException {
        try {
            byte[] b = new byte[512];
            int x = in.read(b, 0, b.length);
            while (x > 0) {
                out.write(b, 0, x);
                x = in.read(b, 0, b.length);
            }
        } finally {
            in.close();
        }
    }

    /** Returns true if and only if this URL causes a redirection. */
    public boolean moved() {
        return (status & MOVED) != 0;
    }

    /** Returns the length of the content, or 0 if it's unknown. */
    public long getContentLength() {
        return temp_file.length();
    }
}