/* * Created on Jun 25, 2005 * */ package com.ai.lucene; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.net.MalformedURLException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.demo.html.HTMLParser; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.TermQuery; /** * @author satya * * TODO To change the template for this generated type comment go to * Window - Preferences - Java - Code Style - Code Templates */ public class LuceneIndex { //Index directory private String m_directory = null; //Analyzer for this index private Analyzer m_analyzer = new StandardAnalyzer(); public LuceneIndex(String path, Analyzer inAnalyzer) { m_directory = path; m_analyzer = inAnalyzer; } /** * Get a lucene index writer for the given index * Close the writer once you are done * -- Not sure if createWriter is a better name for this method -- * @return IndexWriter * @throws IOException */ public IndexWriter getWriter() throws IOException { //Open an index for writing //Directory path //Don't create the index IndexWriter m_indexWriter = new IndexWriter( m_directory, m_analyzer, false); return m_indexWriter; } /** * get an index reader for this lucene index at this directory * You can not instantiate an indexreader directly * @return * @throws IOException */ public IndexReader getReader() throws IOException { return IndexReader.open(m_directory); } public Searcher getIndexSearcher() throws IOException { return new IndexSearcher(getReader()); } /** * Creates an index if it doesn't exist. * If it exists, no action is taken. 
* * Useful for initialization code. */ public void createIndex() throws IOException { //create an index before any operation on this index takes place if (IndexReader.indexExists(m_directory) == true) { Log.log("Index already exists"); return; } //Index is not there create it IndexWriter m_indexWriter = new IndexWriter( m_directory, m_analyzer, true); m_indexWriter.close(); Log.log("Index created successfully"); }//eof-method /** * Erase the current index and create a new one * */ public void recreateIndex() throws IOException { IndexWriter m_indexWriter = new IndexWriter( m_directory, m_analyzer, true); m_indexWriter.close(); } public void deleteIndex() throws IOException { recreateIndex(); } /** * Index the contents of a url into this index * @param inUrl * @throws MalformedURLException * @throws IOException */ public void indexAURL(String inUrl) throws MalformedURLException, IOException { InputStream is = null; Reader htmlReader = null; IndexWriter writer = null; try { //Open the url java.net.URL url = new java.net.URL(inUrl); is = url.openStream(); //Parse the content HTMLParser parser = new HTMLParser(is); htmlReader = parser.getReader(); //Create a document Document doc = new Document(); //Add a field to the document doc.add(Field.Text("content",htmlReader)); LuceneIndex li = LuceneIndexHolder.getLuceneIndex(); writer = li.getWriter(); writer.addDocument(doc); writer.optimize(); //writer.close(); } finally { if (writer != null) writer.close(); if (htmlReader != null) htmlReader.close(); if (is != null) is.close(); } }//eof-functin /** * Index the contents of a url into this index. 
Use the url as a unique identifier * If the url already exists don't index it * @param inUrl * @throws MalformedURLException * @throws IOException */ public void indexAURLWithIdentity(String inUrl) throws MalformedURLException, IOException { Document prevDoc = this.locateIndexedUrl(inUrl); if (prevDoc != null) { //Document already exists return; } alwaysIndexAURLWithIdentity(inUrl); }//eof-functin public void reIndexAURLWithIdentity(String inUrl) throws IOException { Document prevDoc = this.locateIndexedUrl(inUrl); if (prevDoc != null) { //Document already exists //delete the document; this.deleteAURLIndex(inUrl); } // no document at this time alwaysIndexAURLWithIdentity(inUrl); } public void alwaysIndexAURLWithIdentity(String inUrl) throws MalformedURLException, IOException { InputStream is = null; Reader htmlReader = null; IndexWriter writer = null; try { //Open the url java.net.URL url = new java.net.URL(inUrl); is = url.openStream(); //Parse the content HTMLParser parser = new HTMLParser(is); htmlReader = parser.getReader(); //Create a document Document doc = new Document(); //Add a field to the document doc.add(Field.Text("content",htmlReader,true)); doc.add(Field.Keyword("docid",inUrl)); doc.add(Field.UnIndexed("url",inUrl)); writer = getWriter(); writer.addDocument(doc); writer.optimize(); //writer.close(); } finally { if (writer != null) writer.close(); if (htmlReader != null) htmlReader.close(); if (is != null) is.close(); } }//eof-functin /** * Retrieve a document based on an earlier indexed url * @param inUrl * @return */ public Document locateIndexedUrl(String inUrl) throws IOException { Searcher s = null; try { Term term = new Term("docid",inUrl); TermQuery termQuery = new TermQuery(term); s = getIndexSearcher(); Hits hits = s.search(termQuery); if (hits.length() == 0) { return null; } if (hits.length() > 1) { throw new RuntimeException("Too many hits."); } return hits.doc(0); } finally { if (s != null) s.close(); } }//eof-function public int 
locateIndexedUrlDocNumber(String inUrl) throws IOException { Searcher s = null; try { Term term = new Term("docid",inUrl); TermQuery termQuery = new TermQuery(term); s = getIndexSearcher(); Hits hits = s.search(termQuery); if (hits.length() == 0) { throw new RuntimeException("Document not found"); } if (hits.length() > 1) { throw new RuntimeException("Too many hits."); } return hits.id(0); } finally { if (s != null) s.close(); } }//eof-function public void deleteAURLIndex(String inUrl) throws IOException { IndexReader reader = getReader(); Term term = new Term("docid",inUrl); reader.delete(term); } }//eof-class
Satya - Saturday, July 02, 2005 12:37:34 PM
Here is the test driver for the LuceneIndex class above
/* * Created on Jun 25, 2005 * * TODO To change the template for this generated file go to * Window - Preferences - Java - Code Style - Code Templates */ package com.ai.lucene; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.net.MalformedURLException; import org.apache.lucene.demo.html.HTMLParser; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.TermFreqVector; public class Test { public static String url= " =DisplayNoteURL&ownerUserId=satya&reportId=1358"; public static void main(String[] args) { try { //testLocateUrl(); //setupAUrl(); getTermVectors(); } catch(Throwable t) { Log.log(t); } } public static void testIndexWriter() throws IOException { Log.log("Start"); LuceneIndex li = LuceneIndexHolder.getLuceneIndex(); IndexWriter writer = li.getWriter(); writer.close(); Log.log("finish"); } public static void testIndexReader() throws IOException { Log.log("Start"); LuceneIndex li = LuceneIndexHolder.getLuceneIndex(); IndexReader reader = li.getReader(); int numberOfDocs = reader.numDocs(); Log.log("Number of docs in the index:" + numberOfDocs); Log.log("finish"); } public static void testIndexAURL() throws IOException { Log.log("Start"); LuceneIndex li = LuceneIndexHolder.getLuceneIndex(); li.indexAURLWithIdentity(Test.url); Log.log("finish indexing the url with identity"); } public static void deleteIndex() throws IOException { Log.log("Start"); LuceneIndex li = LuceneIndexHolder.getLuceneIndex(); li.deleteIndex(); Log.log("finish deleting index"); } public static Document testLocateUrl() throws IOException { testIndexAURL(); testIndexReader(); Log.log("Start"); LuceneIndex li = LuceneIndexHolder.getLuceneIndex(); Document reply = li.locateIndexedUrl(Test.url); if (reply != null) { Log.log("success"); } else { Log.log("failure"); } Log.log("Finish"); return reply; } 
public static void runS1() throws IOException { deleteIndex(); testIndexWriter(); testIndexReader(); testIndexAURL(); } public static void setupAUrl() throws IOException { deleteIndex(); testIndexAURL(); testLocateUrl(); } public static void getTermVectors() throws IOException { LuceneIndex li = LuceneIndexHolder.getLuceneIndex(); int docnum = li.locateIndexedUrlDocNumber(Test.url); IndexReader reader = li.getReader(); Log.log("Index has " + reader.numDocs() + " documents."); TermFreqVector tfv = reader.getTermFreqVector(docnum,"content"); Log.log(tfv.toString()); } }//eof-class
Satya - Saturday, July 02, 2005 12:38:56 PM
Here is the index holder class
package com.ai.lucene; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; public class LuceneIndexHolder { private static String m_path = "e:\\satya\\webapps\\lucene\\luceneindex"; private static LuceneIndex m_index = null; static { try { m_index = new LuceneIndex(m_path,new StandardAnalyzer()); m_index.createIndex(); } catch(IOException x) { //Sorry can't create an index throw new RuntimeException( "Could not create a lucene index"); } } public static LuceneIndex getLuceneIndex() { return m_index; } }
Satya - Tuesday, July 05, 2005 3:11:55 PM
Sorting a Lucene term frequency list to find the high-frequency words
package com.ai.lucene;

import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

/**
 * A word paired with the number of times it occurs, plus helpers to
 * print and sort lists of such pairs.
 */
public class WordFrequency
{
    //Shared comparator used by sortByDescendingFrequency
    private static WordFrequencyComparator m_wordFrequencyComparator
        = new WordFrequencyComparator();

    public String word;
    public int frequency;

    /**
     * @param inWord      the word
     * @param inFrequency how often the word occurs
     */
    public WordFrequency(String inWord, int inFrequency)
    {
        word = inWord;
        frequency = inFrequency;
    }

    /**
     * @return this entry formatted as "frequency:word"
     */
    public String toString()
    {
        return frequency + ":" + word;
    }

    /**
     * Formats every remaining element of the iterator, one per line.
     * @param itr iterator over WordFrequency objects; it is consumed
     * @return the formatted entries, each followed by a newline
     */
    public static String toString(Iterator itr)
    {
        StringBuffer sbuf = new StringBuffer();
        while(itr.hasNext())
        {
            WordFrequency wf = (WordFrequency)itr.next();
            sbuf.append(wf.toString());
            sbuf.append("\n");
        }
        return sbuf.toString();
    }

    /**
     * Sorts the list in place using the shared frequency comparator.
     * @param listOfWordFrequencies list of WordFrequency objects
     */
    public static void sortByDescendingFrequency(List listOfWordFrequencies)
    {
        Collections.sort(listOfWordFrequencies, m_wordFrequencyComparator);
    }
}

/**
 * Orders WordFrequency objects by their frequency field alone.
 * The ordering is whatever CompareUtils.descendingIntCompare defines,
 * presumably highest frequency first; verify against CompareUtils.
 */
class WordFrequencyComparator implements Comparator
{
    /**
     * @param arg0 a WordFrequency
     * @param arg1 a WordFrequency
     * @return the result of CompareUtils.descendingIntCompare on the
     *         two frequencies
     * @throws ClassCastException if either argument is not a
     *                            WordFrequency
     */
    public int compare(Object arg0, Object arg1)
    {
        WordFrequency wf1 = (WordFrequency)arg0;
        WordFrequency wf2 = (WordFrequency)arg1;
        return CompareUtils.descendingIntCompare(
                wf1.frequency
                ,wf2.frequency);
    }
}