import java.io.File;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.BufferedOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
/**
* Netrand Project
* Software Engineering - CS536
* University of Wisconsin - Milwaukee
* Authors:
*
* - Spring 1998 - Francis William Kasper
*
* File: Indexer.java
* Note:
* This file was originally part of a Web Crawler program written by
* Tim Macinta in 1997 that gathered links from the Internet and formed
* a web search database. The files containing the logic for crawling
* across the internet were taken from this program and slightly modified
* for the purpose of the NetRand project.
*
* The Indexer is a thread which can index URLs that have been
* cached using the URLStatus class. Use the queueURL() method
* to add cached URLs to the Indexer's list of URLs. Once the
* start() method is called, the Indexer will start processing
* URLs in its queue. More URLs can also be added after calling
* start, in fact this may be the best way to use the Indexer.
* Calling the stopWhenDone() method will cause the Indexer
* thread to stop as soon as its queue empties.
*/
public class Indexer extends Thread {

    /** Milliseconds to sleep between polls while the queue is empty. */
    private static final long IDLE_SLEEP_MILLIS = 5000;

    File working_dir;                        // a temporary working directory
    FIFOQueue q = new FIFOQueue();           // queue for cached URLs
    Object q_mutex = new Object();           // guards every access to q
    // Written by whoever calls start()/stopWhenDone() and read by the indexer
    // thread, hence volatile. Set to false when the thread should stop.
    volatile boolean running = false;
    Crawler crawler;                         // the crawler that receives extracted URLs
    EnginePrefs prefs;                       // preferences
    // Written by stopWhenDone() and read by the indexer thread, hence volatile.
    volatile boolean exit_when_done = false; // exit the JVM when done indexing?
    long total_bytes = 0;                    // running sum of getContentLength() over handled URLs

    /**
     * Creates an Indexer and immediately removes any ".tmp" files found in
     * "working_dir".
     *
     * "working_dir" should be a directory that only this Indexer and a given
     * Crawler will be accessing. This means that if several Indexers are
     * running simultaneously, they should all be given different
     * "working_dir" directories. Also, no other threads should write to this
     * directory (except for the selected Crawler).
     *
     * @param working_dir temporary working directory
     * @param crawler     crawler that is handed every URL extracted while indexing
     * @param prefs       preferences (stored; not otherwise used in this class)
     */
    public Indexer(File working_dir, Crawler crawler, EnginePrefs prefs) {
        this.working_dir = working_dir;
        this.crawler = crawler;
        this.prefs = prefs;
        cleanUp(); // remove all temporary files
    }

    /**
     * Adds a cached URL to this Indexer's queue. May be called before or
     * after start(). A null argument is ignored.
     *
     * @param url the cached URL to index
     */
    public void queueURL(URLStatus url) {
        if (url == null) {
            return;
        }
        synchronized (q_mutex) {
            q.addElement(url);
        }
    }

    /** Marks this Indexer as running and starts its thread. */
    @Override
    public void start() {
        running = true;
        super.start();
    }

    /**
     * Indexing loop. Processes queued URLs one at a time, sleeping while the
     * queue is empty, until stopWhenDone() has been called and the queue has
     * drained. It then removes temporary files, requests a garbage
     * collection and, if asked to by stopWhenDone(), exits the JVM.
     */
    @Override
    public void run() {
        while (true) {
            // Snapshot the stop flag BEFORE looking at the queue: if a stop
            // request is already visible here, every URL queued before that
            // request is visible to the queue check below, so nothing queued
            // ahead of stopWhenDone() is left behind.
            boolean keep_running = running;

            URLStatus url = null;
            boolean has_work;
            synchronized (q_mutex) {
                has_work = q.hasMoreElements();
                if (has_work) {
                    // The entry is removed later, by nextElement() in indexHead().
                    url = (URLStatus) q.readNextElement();
                }
            }

            if (has_work) {
                try {
                    indexHead(url);
                } catch (IOException e) {
                    // The failed URL has already been dequeued; report and move on.
                    e.printStackTrace();
                }
            } else if (keep_running) {
                // Nothing in queue so sleep for a few seconds
                try {
                    Thread.sleep(IDLE_SLEEP_MILLIS);
                } catch (InterruptedException ignored) {
                    // An interrupt only cuts the nap short. The flag is
                    // deliberately not restored: doing so would make every
                    // later sleep() fail immediately and turn this loop into
                    // a busy spin.
                }
            } else {
                break; // stop requested and the queue is empty
            }
        }
        cleanUp();
        System.gc();
        if (exit_when_done) {
            System.exit(0);
        }
    }

    /**
     * Indexes the URL at the head of the queue and then advances the queue.
     * The queue is advanced even when indexing fails, so that a URL which
     * cannot be processed is not retried forever.
     *
     * @param url the entry just read from the head of the queue
     * @throws IOException if processing the URL fails
     */
    private void indexHead(URLStatus url) throws IOException {
        try {
            total_bytes += url.getContentLength();
            addNewURLs(url.getLinkExtractor());
            SpiderEntropy.MineEntropy(url.temp_file);
        } finally {
            synchronized (q_mutex) {
                q.nextElement();
            }
        }
    }

    /**
     * Causes this Indexer to stop whenever it finishes indexing the URLs
     * in its queue.
     *
     * @param exit_when_done if true, the indexer thread calls System.exit(0)
     *                       once it has finished
     */
    public void stopWhenDone(boolean exit_when_done) {
        // Publish the exit preference first: the indexer thread tests it
        // right after it observes running == false.
        this.exit_when_done = exit_when_done;
        running = false;
    }

    /** Removes all ".tmp" files in the directory "working_dir". */
    void cleanUp() {
        String[] files = working_dir.list();
        if (files == null) {
            // list() returns null when working_dir is not a directory or an
            // I/O error occurs; there is nothing to clean in that case.
            return;
        }
        for (int i = 0; i < files.length; i++) {
            if (files[i].endsWith(".tmp")) {
                new File(working_dir, files[i]).delete();
            }
        }
    }

    /**
     * Adds new URLs to the crawler's queue.
     *
     * @param urls source of the URLs to hand to the crawler; it is consumed
     *             until hasMoreElements() returns false
     */
    void addNewURLs(LinkExtractor urls) {
        while (urls.hasMoreElements()) {
            crawler.addURL((URL) urls.nextElement());
        }
    }
}