/*
* Created on Jun 25, 2005
*
*/
package com.ai.lucene;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.MalformedURLException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.demo.html.HTMLParser;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
/**
 * Convenience wrapper around a single Lucene index stored in a directory.
 *
 * Provides factory methods for writers, readers and searchers on that
 * directory, index creation/reset helpers, and helpers that fetch a URL,
 * parse it as HTML and add the parsed text to the index, optionally keyed
 * by the URL itself (field "docid") so it can be located or deleted later.
 *
 * Every writer, reader or searcher handed out by this class must be closed
 * by the caller.
 *
 * @author satya
 */
public class LuceneIndex
{
    //Index directory
    private String m_directory = null;

    //Analyzer for this index; StandardAnalyzer unless the constructor
    //is given a non-null replacement
    private Analyzer m_analyzer =
        new StandardAnalyzer();

    /**
     * @param path directory holding (or that will hold) the index
     * @param inAnalyzer analyzer used by writers of this index; when null
     *        the default StandardAnalyzer is kept instead of being
     *        overwritten with null
     */
    public LuceneIndex(String path, Analyzer inAnalyzer)
    {
        m_directory = path;
        if (inAnalyzer != null)
        {
            m_analyzer = inAnalyzer;
        }
    }

    /**
     * Get a lucene index writer for the given index.
     * The writer is opened with the "create" flag set to false, so the
     * index is expected to exist already (see createIndex()).
     * Close the writer once you are done.
     * -- Not sure if createWriter is a better name for this method --
     * @return a new IndexWriter on this index's directory
     * @throws IOException if the writer cannot be opened
     */
    public IndexWriter getWriter()
        throws IOException
    {
        return new IndexWriter(m_directory, m_analyzer, false);
    }

    /**
     * Get an index reader for this lucene index at this directory.
     * You can not instantiate an IndexReader directly.
     * Close the reader once you are done.
     * @return a new IndexReader on this index's directory
     * @throws IOException if the reader cannot be opened
     */
    public IndexReader getReader() throws IOException
    {
        return IndexReader.open(m_directory);
    }

    /**
     * Get a searcher backed by a freshly opened reader (see getReader()).
     * Close the searcher once you are done.
     * NOTE(review): whether closing the searcher also closes the reader it
     * was built on depends on the Lucene version - confirm.
     * @return a new IndexSearcher
     * @throws IOException if the underlying reader cannot be opened
     */
    public Searcher getIndexSearcher() throws IOException
    {
        return new IndexSearcher(getReader());
    }

    /**
     * Creates an index if it doesn't exist.
     * If it exists, no action is taken.
     *
     * Useful for initialization code.
     * @throws IOException if the index cannot be created
     */
    public void createIndex() throws IOException
    {
        //create an index before any operation on this index takes place
        if (IndexReader.indexExists(m_directory))
        {
            Log.log("Index already exists");
            return;
        }
        //Index is not there create it
        createEmptyIndex();
        Log.log("Index created successfully");
    }//eof-method

    /**
     * Erase the current index and create a new one.
     * @throws IOException if the index cannot be (re)created
     */
    public void recreateIndex() throws IOException
    {
        createEmptyIndex();
    }

    /**
     * Empties the index; currently identical to recreateIndex().
     * @throws IOException if the index cannot be (re)created
     */
    public void deleteIndex() throws IOException
    {
        recreateIndex();
    }

    /**
     * Index the contents of a url into this index.
     * Only a "content" field is added; the document carries no identity,
     * so it cannot be found again through locateIndexedUrl().
     * @param inUrl url whose html content is fetched and indexed
     * @throws MalformedURLException if inUrl is not a valid url
     * @throws IOException if the url cannot be read or the index written
     */
    public void indexAURL(String inUrl)
        throws MalformedURLException, IOException
    {
        //Writes to THIS index (previously went through LuceneIndexHolder,
        //which ignored the instance the method was called on)
        addUrlDocument(inUrl, false);
    }//eof-function

    /**
     * Index the contents of a url into this index. Use the url as a unique identifier.
     * If the url already exists don't index it.
     * @param inUrl url whose html content is fetched and indexed
     * @throws MalformedURLException if inUrl is not a valid url
     * @throws IOException if the url cannot be read or the index accessed
     */
    public void indexAURLWithIdentity(String inUrl)
        throws MalformedURLException, IOException
    {
        if (this.locateIndexedUrl(inUrl) != null)
        {
            //Document already exists
            return;
        }
        alwaysIndexAURLWithIdentity(inUrl);
    }//eof-function

    /**
     * Index the contents of a url, replacing any document previously
     * indexed under the same url.
     * @param inUrl url whose html content is fetched and indexed
     * @throws IOException if the url cannot be read or the index accessed
     */
    public void reIndexAURLWithIdentity(String inUrl) throws IOException
    {
        if (this.locateIndexedUrl(inUrl) != null)
        {
            //Document already exists: delete it first
            this.deleteAURLIndex(inUrl);
        }
        // no document at this time
        alwaysIndexAURLWithIdentity(inUrl);
    }

    /**
     * Index the contents of a url together with its identity fields,
     * without checking whether the url was indexed before.
     * Adds "content" (parsed html text), "docid" (the url as a keyword)
     * and "url" (the url, unindexed).
     * @param inUrl url whose html content is fetched and indexed
     * @throws MalformedURLException if inUrl is not a valid url
     * @throws IOException if the url cannot be read or the index written
     */
    public void alwaysIndexAURLWithIdentity(String inUrl)
        throws MalformedURLException, IOException
    {
        addUrlDocument(inUrl, true);
    }//eof-function

    /**
     * Retrieve a document based on an earlier indexed url.
     * @param inUrl url used as the "docid" when the document was indexed
     * @return the matching document, or null when nothing is indexed
     *         under that url
     * @throws IOException if the index cannot be searched
     * @throws RuntimeException if more than one document matches
     */
    public Document locateIndexedUrl(String inUrl) throws IOException
    {
        Searcher s = null;
        try
        {
            s = getIndexSearcher();
            Hits hits = s.search(docIdQuery(inUrl));
            if (hits.length() == 0)
            {
                return null;
            }
            if (hits.length() > 1)
            {
                throw new RuntimeException("Too many hits.");
            }
            //Fetch the document while the searcher is still open
            return hits.doc(0);
        }
        finally
        {
            if (s != null) s.close();
        }
    }//eof-function

    /**
     * Retrieve the lucene document number of an earlier indexed url.
     * @param inUrl url used as the "docid" when the document was indexed
     * @return the document number reported by the search hit
     * @throws IOException if the index cannot be searched
     * @throws RuntimeException if no document, or more than one, matches
     */
    public int locateIndexedUrlDocNumber(String inUrl) throws IOException
    {
        Searcher s = null;
        try
        {
            s = getIndexSearcher();
            Hits hits = s.search(docIdQuery(inUrl));
            if (hits.length() == 0)
            {
                throw new RuntimeException("Document not found");
            }
            if (hits.length() > 1)
            {
                throw new RuntimeException("Too many hits.");
            }
            return hits.id(0);
        }
        finally
        {
            if (s != null) s.close();
        }
    }//eof-function

    /**
     * Delete every document indexed under the given url ("docid" term).
     * @param inUrl url used as the "docid" when the document was indexed
     * @throws IOException if the index cannot be opened or modified
     */
    public void deleteAURLIndex(String inUrl) throws IOException
    {
        IndexReader reader = getReader();
        try
        {
            reader.delete(new Term("docid", inUrl));
        }
        finally
        {
            //The reader was previously never closed. Besides leaking the
            //handle, a reader that deleted documents presumably keeps the
            //index locked and the deletion unflushed until it is closed,
            //which would make the getWriter() call that follows in
            //reIndexAURLWithIdentity() fail - verify against the
            //IndexReader documentation.
            reader.close();
        }
    }

    /** Opens a writer with the "create" flag set and closes it right away. */
    private void createEmptyIndex() throws IOException
    {
        IndexWriter writer = new IndexWriter(m_directory, m_analyzer, true);
        writer.close();
    }

    /** Builds the query matching the "docid" keyword stored for a url. */
    private static TermQuery docIdQuery(String inUrl)
    {
        return new TermQuery(new Term("docid", inUrl));
    }

    /**
     * Fetches a url, parses it as html and adds it as one document.
     * @param inUrl url to fetch
     * @param withIdentity when true the "docid" and "url" fields are added
     *        and "content" is created with the three-argument Field.Text
     *        (presumably requesting term-vector storage - confirm against
     *        the Field documentation); when false only a plain "content"
     *        field is added
     */
    private void addUrlDocument(String inUrl, boolean withIdentity)
        throws MalformedURLException, IOException
    {
        InputStream is = null;
        Reader htmlReader = null;
        IndexWriter writer = null;
        try
        {
            //Open the url
            java.net.URL url = new java.net.URL(inUrl);
            is = url.openStream();

            //Parse the content
            HTMLParser parser = new HTMLParser(is);
            htmlReader = parser.getReader();

            //Create a document and add its fields
            Document doc = new Document();
            if (withIdentity)
            {
                doc.add(Field.Text("content",htmlReader,true));
                doc.add(Field.Keyword("docid",inUrl));
                doc.add(Field.UnIndexed("url",inUrl));
            }
            else
            {
                doc.add(Field.Text("content",htmlReader));
            }

            writer = getWriter();
            writer.addDocument(doc);
            writer.optimize();
        }
        finally
        {
            closeAll(writer, htmlReader, is);
        }
    }

    /**
     * Closes the three resources in order, making sure that a failure
     * while closing one of them does not prevent closing the others.
     * Null arguments are skipped.
     */
    private static void closeAll(
        IndexWriter writer, Reader htmlReader, InputStream is)
        throws IOException
    {
        try
        {
            if (writer != null) writer.close();
        }
        finally
        {
            try
            {
                if (htmlReader != null) htmlReader.close();
            }
            finally
            {
                if (is != null) is.close();
            }
        }
    }
}//eof-class
Satya - Saturday, July 02, 2005 12:37:34 PM
Here is the test driver for the LuceneIndex class:
/*
* Created on Jun 25, 2005
*
* TODO To change the template for this generated file go to
* Window - Preferences - Java - Code Style - Code Templates
*/
package com.ai.lucene;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.MalformedURLException;
import org.apache.lucene.demo.html.HTMLParser;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermFreqVector;
public class Test {
public static String url=
"http://216.187.231.34/akc/servlet/DisplayServlet?url
=DisplayNoteURL&ownerUserId=satya&reportId=1358";
public static void main(String[] args)
{
try
{
//testLocateUrl();
//setupAUrl();
getTermVectors();
}
catch(Throwable t)
{
Log.log(t);
}
}
public static void testIndexWriter() throws IOException
{
Log.log("Start");
LuceneIndex li = LuceneIndexHolder.getLuceneIndex();
IndexWriter writer = li.getWriter();
writer.close();
Log.log("finish");
}
public static void testIndexReader() throws IOException
{
Log.log("Start");
LuceneIndex li = LuceneIndexHolder.getLuceneIndex();
IndexReader reader = li.getReader();
int numberOfDocs = reader.numDocs();
Log.log("Number of docs in the index:" + numberOfDocs);
Log.log("finish");
}
public static void testIndexAURL() throws IOException
{
Log.log("Start");
LuceneIndex li = LuceneIndexHolder.getLuceneIndex();
li.indexAURLWithIdentity(Test.url);
Log.log("finish indexing the url with identity");
}
public static void deleteIndex() throws IOException
{
Log.log("Start");
LuceneIndex li = LuceneIndexHolder.getLuceneIndex();
li.deleteIndex();
Log.log("finish deleting index");
}
public static Document testLocateUrl() throws IOException
{
testIndexAURL();
testIndexReader();
Log.log("Start");
LuceneIndex li = LuceneIndexHolder.getLuceneIndex();
Document reply =
li.locateIndexedUrl(Test.url);
if (reply != null)
{
Log.log("success");
}
else
{
Log.log("failure");
}
Log.log("Finish");
return reply;
}
public static void runS1() throws IOException
{
deleteIndex();
testIndexWriter();
testIndexReader();
testIndexAURL();
}
public static void setupAUrl() throws IOException
{
deleteIndex();
testIndexAURL();
testLocateUrl();
}
public static void getTermVectors() throws IOException
{
LuceneIndex li = LuceneIndexHolder.getLuceneIndex();
int docnum = li.locateIndexedUrlDocNumber(Test.url);
IndexReader reader = li.getReader();
Log.log("Index has " + reader.numDocs() + " documents.");
TermFreqVector tfv =
reader.getTermFreqVector(docnum,"content");
Log.log(tfv.toString());
}
}//eof-class
Satya - Saturday, July 02, 2005 12:38:56 PM
Here is the index holder class (LuceneIndexHolder):
package com.ai.lucene;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
/**
 * Holds the single shared LuceneIndex used by the application.
 *
 * The index is built over a fixed directory with a StandardAnalyzer when
 * this class is initialized, and is created on disk at that point if it
 * does not exist yet.
 */
public class LuceneIndexHolder
{
    //Directory of the shared index
    private static String m_path =
        "e:\\satya\\webapps\\lucene\\luceneindex";

    //The shared index, set once by the static initializer below
    private static LuceneIndex m_index = null;

    static
    {
        try
        {
            m_index =
                new LuceneIndex(m_path,new StandardAnalyzer());
            m_index.createIndex();
        }
        catch(IOException x)
        {
            //Sorry can't create an index.
            //Keep the IOException as the cause so the real reason
            //(missing directory, permissions, lock...) is not lost.
            throw new RuntimeException(
                "Could not create a lucene index", x);
        }
    }

    /**
     * @return the shared index
     */
    public static LuceneIndex getLuceneIndex()
    {
        return m_index;
    }
}
Satya - Tuesday, July 05, 2005 3:11:55 PM
Sorting a Lucene term-frequency list to find the high-frequency words
package com.ai.lucene;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
/**
 * A word paired with the number of times it occurs.
 *
 * Also offers static helpers to print a sequence of WordFrequency objects
 * and to sort a list of them with WordFrequencyComparator.
 */
public class WordFrequency
{
    //Shared comparator used by sortByDescendingFrequency
    private static final WordFrequencyComparator
        m_wordFrequencyComparator =
            new WordFrequencyComparator();

    /** The word itself. */
    public String word;

    /** Number of occurrences recorded for the word. */
    public int frequency;

    /**
     * @param inWord the word
     * @param inFrequency number of occurrences of the word
     */
    public WordFrequency(String inWord, int inFrequency)
    {
        word = inWord;
        frequency = inFrequency;
    }

    /**
     * @return the text "frequency:word"
     */
    public String toString()
    {
        //An unused StringBuffer used to be allocated here on every call
        return frequency + ":" + word;
    }

    /**
     * Renders every WordFrequency produced by the iterator, one per line.
     * @param itr iterator whose elements are all WordFrequency objects
     * @return the concatenated "frequency:word" lines, each followed by
     *         a newline; empty when the iterator has no elements
     * @throws ClassCastException if an element is not a WordFrequency
     */
    public static String toString(Iterator itr)
    {
        StringBuffer sbuf = new StringBuffer();
        while(itr.hasNext())
        {
            WordFrequency wf = (WordFrequency)itr.next();
            sbuf.append(wf.toString());
            sbuf.append("\n");
        }
        return sbuf.toString();
    }

    /**
     * Sorts the list in place using WordFrequencyComparator
     * (presumably most frequent word first, as the name suggests; the
     * ordering itself is decided by that comparator).
     * @param listOfWordFrequencies list whose elements are all
     *        WordFrequency objects
     */
    public static void
    sortByDescendingFrequency(List listOfWordFrequencies)
    {
        Collections.sort(listOfWordFrequencies,
            m_wordFrequencyComparator);
    }
}
/**
 * Orders WordFrequency objects by their frequency field, delegating the
 * actual integer comparison to CompareUtils.descendingIntCompare.
 */
class WordFrequencyComparator implements Comparator
{
    /**
     * @param arg0 first WordFrequency
     * @param arg1 second WordFrequency
     * @return whatever CompareUtils.descendingIntCompare returns for the
     *         two frequencies, in the order (arg0, arg1)
     */
    public int compare(Object arg0, Object arg1)
    {
        //Cast both arguments before touching either of them
        final WordFrequency left = (WordFrequency)arg0;
        final WordFrequency right = (WordFrequency)arg1;

        final int leftFrequency = left.frequency;
        final int rightFrequency = right.frequency;
        return CompareUtils.descendingIntCompare(leftFrequency, rightFrequency);
    }
}