import java.net.*; import java.io.*; /** *

NetRand Project

*

Software Engineering - CS536

*

University of Wisconsin - Milwaukee

*

Authors:

* *
File: BinaryLinkExtractor.java *
*

* BinaryLinkExtractor extracts links to binary files from HTML files. */ public class BinaryLinkExtractor { // --------- Class Variables ----------- // /** the url of the HTML to parse */ private URL url; /** handles the URL connection */ private URLConnection urlConn; /** the name of the directory to store the cache file */ private String tmp_dir = "./"; /** the cache file */ private File cacheFile; // ---------- Class Methods ----------- // /** * constructor for the BinaryLinkExtractor */ public BinaryLinkExtractor() { url = null; urlConn = null; cacheFile = null; } // ------------------------------------- // /** * calls the appropriate functions to extract the binary file links from * given URL. * @param urlString - the target URL * @return a LinkedList containing the URLs to the found binary files */ public LinkedList extractBinaryLinks( String urlString ) { String baseURL = null; String binaryFile = null; String tag = null; FileInputStream theHTML = null; LinkedList linkList = new LinkedList(); try { baseURL = cacheHTMLDocument( urlString ); baseURL = baseURL.substring(0, baseURL.lastIndexOf('/')+1); theHTML = new FileInputStream( cacheFile ); while ( (tag = readTag(theHTML)) != null ) { if ( isBaseTag(tag) ) { baseURL = extract(tag, "href"); } if ( isHypertextLink(tag) ) { binaryFile = extract(tag, "href"); if ( binaryFile.indexOf("http") < 0 ) binaryFile = baseURL.concat(binaryFile); if ( isBinaryFile(binaryFile) ) linkList.append( new String(binaryFile) ); } if ( isImageSourceTag(tag) ) { binaryFile = extract(tag, "src"); if ( binaryFile.indexOf("http") < 0 ) binaryFile = baseURL.concat(binaryFile); if ( isBinaryFile(binaryFile) ) linkList.append( new String(binaryFile) ); } } theHTML.close(); deleteCacheDocument(); } catch ( Exception e ) { System.err.println( e ); return null; } if ( linkList.size() > 0 ) return linkList; else return null; } // ------------------------------------ // /** * gets the HTMl file from the remote serve and writes it to a local cache * file * @param urlString - the URL of the target HTML file * @exception IOException if there was an error reading the HTML file */ private String cacheHTMLDocument( String urlString ) throws IOException { String thisLine = null; String urlName = null; if ( cacheFile != null ) deleteCacheDocument(); try { url = new URL( urlString ); urlConn = url.openConnection(); cacheFile = new File( tmp_dir + "html.cache" ); BufferedReader in = new BufferedReader( new InputStreamReader(urlConn.getInputStream()) ); PrintWriter out = new PrintWriter( new FileOutputStream(cacheFile), true ); urlName = new String(urlConn.toString()); urlName = urlName.substring(urlName.lastIndexOf("http://")); while ( (thisLine = in.readLine()) != null ) out.println(thisLine); in.close(); out.close(); } catch ( MalformedURLException me ) { System.err.println( me ); deleteCacheDocument(); throw new IOException( urlString + ": is not a URL I understand" ); } catch ( Exception e ) { System.err.println( e ); deleteCacheDocument(); throw new IOException( urlName + ":cannot cache HTML file" ); } return urlName; } // --------------------------------------- // /** * deletes the cache file */ private void deleteCacheDocument() { if ( cacheFile != null ) { cacheFile.delete(); cacheFile = null; } } // --------------------------------------- // /** * parses for a HTML tag in a HTML file * @param theHTML - the HTML file's input stream * @return a String containing the HTML tag, or null if no tag was found */ private String readTag( FileInputStream theHTML ) { int c; StringBuffer tagBuf = new StringBuffer(); try { while ( ((c = theHTML.read()) != -1) && ((char)c != '<') ); if ( (char)c == '<' ) { while( ((c = theHTML.read()) != -1) && ((char)c != '>') ) tagBuf.append((char)c); } } catch ( IOException e ) { System.err.println( e ); } if ( tagBuf.length() > 0 ) return ( new String(tagBuf.toString()) ); else return null; } // --------------------------------------- // /** * extracts the text within quotes from a line immediately after a given key * @param line - the line to extract the test from * @param key - the key to search for * @return a String containing the extracted text */ private String extract( String line, String key ) { try { key = key.toLowerCase(); String lower_case = line.toLowerCase(); int i = lower_case.indexOf(key); if ( i < 0 ) return null; i += key.length(); if ( line.charAt(i) != '=') return null; i++; int i2; if ( line.charAt(i) == '"' ) { i++; i2 = line.indexOf('"', i); if ( i2 < 0 ) return line.substring(i); else return line.substring(i, i2); } else { int targ = line.length(); for ( i2 = i; i2 < targ; i2++ ) if ( Character.isSpaceChar(line.charAt(i2)) ) break; return line.substring(i, i2); } } catch ( StringIndexOutOfBoundsException e ) {} return null; } // ------------------------------------- // /** * @param tag - the HTML tag to test * @return true iff the tag represents the HTML BASE tag, otherwise false */ private boolean isBaseTag( String tag ) { if ( (tag.indexOf("BASE") != -1 || tag.indexOf("base") != -1) && (tag.indexOf("HREF") != -1 || tag.indexOf("href") != -1) ) return true; else return false; } // ------------------------------------- // /** * @param tag - the HTML tag to test * @return true iff the HTML tag contains a hypertext link, otherwise false */ private boolean isHypertextLink( String tag ) { if ( tag.indexOf("HREF") != -1 || tag.indexOf("href") != -1 ) return true; else return false; } // -------------------------------------- // /** * @param tag - the HTML tag to test * @return true iff the HTML tag is an IMG SRC tag, otherwise false */ private boolean isImageSourceTag( String tag ) { if ( (tag.indexOf("IMG") != -1 || tag.indexOf("img") != -1) && (tag.indexOf("SRC") != -1 || tag.indexOf("src") != -1) ) return true; else return false; } // -------------------------------------- // /** * @param link - the hypertext link to test * @return true iff the link references a binary file, otherwise false */ private boolean isBinaryFile( String link ) { if ( link.endsWith(".au") || link.endsWith(".wav") || link.endsWith(".aiff") || link.endsWith(".aifc") || link.endsWith(".mpg") || link.endsWith(".mpeg") || link.endsWith(".voc") || link.endsWith(".ra") || link.endsWith(".ram") || link.endsWith(".gif") || link.endsWith(".jpg") || link.endsWith(".jpeg") || link.endsWith(".tif") || link.endsWith(".tiff") || link.endsWith(".bmp") || link.endsWith(".mov") || link.endsWith(".moov") || link.endsWith(".qt") ) return true; return false; } }