import java.io.File; import java.io.InputStream; import java.io.OutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.BufferedOutputStream; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.net.URL; /** *

Netrand Project

*

Software Engineering - CS536

*

University of Wisconsin - Milwaukee

*

Authors:

* *
File: Indexer.java
* Note: * This file was originally part of a Web Crawler program written by * Tim Macinta in 1997 that gathered links from the internet and formed * a web search database. The files containing the logic for crawling * across the internet were taken from this program and slightly modified * for the purpose of the NetRand project. *
* The Indexer is a thread which can index URLs that have been
* cached using the URLStatus class.  Use the queueURL() method
* to add cached URLs to the Indexer's list of URLs.  Once the
* start() method is called, the Indexer will start processing
* URLs in its queue.  More URLs can also be added after calling
* start; in fact this may be the best way to use the Indexer.
* Calling the stopWhenDone() method will cause the Indexer
* thread to stop as soon as its queue empties.
*/
public class Indexer extends Thread {

    /** How long run() sleeps, in milliseconds, when the queue is empty. */
    private static final long IDLE_SLEEP_MILLIS = 5000;

    File working_dir;                 // a temporary working directory
    FIFOQueue q = new FIFOQueue();    // queue for cached URLs
    Object q_mutex = new Object();    // guards every access to q

    // running / exit_when_done are written by the threads that call start()
    // and stopWhenDone() and read by the indexing thread, so they are
    // volatile to make those writes visible to the loop in run().
    volatile boolean running = false;        // set to false when the Thread should stop
    Crawler crawler;                         // the crawler that retrieves URLs
    EnginePrefs prefs;                       // preferences (stored; not read in this class)
    volatile boolean exit_when_done = false; // exit the JVM when done indexing?
    long total_bytes = 0;                    // total number of bytes indexed

    /**
     * Creates an Indexer and removes any leftover ".tmp" files from
     * "working_dir".
     *
     * "working_dir" should be a directory that only this Indexer and a
     * given Crawler will be accessing.  This means that if several
     * Indexers are running simultaneously, they should all be given
     * different "working_dir" directories.  Also, no other threads should
     * write to this directory (except for the selected Crawler).
     *
     * @param working_dir temporary directory holding the cached files
     * @param crawler     crawler that receives the links found while indexing
     * @param prefs       preferences object; kept for later use
     */
    public Indexer(File working_dir, Crawler crawler, EnginePrefs prefs) {
        this.working_dir = working_dir;
        this.crawler = crawler;
        this.prefs = prefs;
        cleanUp(); // remove all temporary files
    }

    /**
     * Use this method to add a cached url to the Indexer.
     *
     * @param url the cached URL to enqueue; a null value is ignored
     */
    public void queueURL(URLStatus url) {
        if (url == null) {
            return;
        }
        synchronized (q_mutex) {
            q.addElement(url);
        }
    }

    /** Starts the Indexer: marks it as running, then starts the thread. */
    public void start() {
        running = true;
        super.start();
    }

    /**
     * This is where the actual indexing is done.
     *
     * Loops while the queue holds URLs or the Indexer is still marked as
     * running.  For each queued URL it adds the content length to
     * total_bytes, forwards the extracted links to the crawler and hands
     * the cached temp file to SpiderEntropy.MineEntropy.  When the loop
     * ends, temporary files are removed and, if requested through
     * stopWhenDone(true), the JVM is terminated with System.exit(0).
     */
    public void run() {
        while (hasQueuedURLs() || running) {
            if (hasQueuedURLs()) {
                URLStatus url;
                synchronized (q_mutex) {
                    // NOTE(review): readNextElement() is presumably a
                    // non-removing peek (nextElement() is called afterwards
                    // to drop the entry) -- confirm against FIFOQueue.
                    url = (URLStatus) q.readNextElement();
                }
                try {
                    total_bytes += url.getContentLength();
                    addNewURLs(url.getLinkExtractor());
                    SpiderEntropy.MineEntropy(url.temp_file);
                } catch (IOException e) {
                    e.printStackTrace();
                } finally {
                    // Always drop the head entry.  Previously a failing URL
                    // was never dequeued, so the loop re-read it and failed
                    // again forever.
                    synchronized (q_mutex) {
                        q.nextElement();
                    }
                }
            } else {
                // Nothing in queue so sleep for a few seconds
                try {
                    Thread.sleep(IDLE_SLEEP_MILLIS);
                } catch (InterruptedException ignored) {
                    // An interrupt only cuts the nap short; the loop
                    // condition is re-evaluated right away.
                }
            }
        }

        cleanUp();
        System.gc();
        if (exit_when_done) {
            System.exit(0);
        }
    }

    /**
     * Causes this Indexer to stop whenever it finishes indexing the URLs
     * in its queue.
     *
     * @param exit_when_done if true, run() calls System.exit(0) once the
     *                       queue has been drained
     */
    public void stopWhenDone(boolean exit_when_done) {
        // Publish the exit flag before clearing "running": run() reads
        // exit_when_done only after it has observed running == false.
        this.exit_when_done = exit_when_done;
        running = false;
    }

    /**
     * Removes all ".tmp" files in the directory "working_dir".  Does
     * nothing when the directory cannot be listed.
     */
    void cleanUp() {
        String[] files = working_dir.list();
        if (files == null) {
            // File.list() yields null when the path is not a directory or
            // an I/O error occurs; there is nothing to delete then.
            return;
        }
        for (int i = 0; i < files.length; i++) {
            if (files[i].endsWith(".tmp")) {
                new File(working_dir, files[i]).delete();
            }
        }
    }

    /**
     * Adds new URLs to the crawler's queue.
     *
     * @param urls source of the links; every element is passed to
     *             crawler.addURL
     */
    void addNewURLs(LinkExtractor urls) {
        while (urls.hasMoreElements()) {
            crawler.addURL((URL) urls.nextElement());
        }
    }

    /** Returns whether the queue currently holds a URL, reading it under q_mutex. */
    private boolean hasQueuedURLs() {
        synchronized (q_mutex) {
            return q.hasMoreElements();
        }
    }
}