Manage this page

1. Back to lucene notes

2. Display

3. Feedback

 
/*
 * Created on Jun 25, 2005
 *
 */
package com.ai.lucene;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.MalformedURLException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.demo.html.HTMLParser;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;

/**
 * Convenience wrapper around a single Lucene index stored in a file system
 * directory. It hands out writers, readers and searchers for that index and
 * offers helpers that fetch a url, parse it as html and index its content,
 * optionally keyed by the url itself (field "docid") so the document can be
 * located, re-indexed or deleted later.
 *
 * Every writer, reader or searcher returned to a caller must be closed by
 * that caller.
 *
 * @author satya
 */
public class LuceneIndex 
{
	//Index directory
	private final String m_directory;
	
	//Analyzer for this index
	private final Analyzer m_analyzer;
	
	/**
	 * @param path directory holding the index files
	 * @param inAnalyzer analyzer used when writing to the index;
	 *        a StandardAnalyzer is used when null is passed
	 */
	public LuceneIndex(String path, Analyzer inAnalyzer)
	{
		m_directory = path;
		m_analyzer = (inAnalyzer != null) ? inAnalyzer : new StandardAnalyzer();
	}

	/**
	 * Get a lucene index writer for the given index.
	 * The index must already exist (see createIndex).
	 * Close the writer once you are done.
	 * -- Not sure if createWriter is a better name for this method --
	 * @return a new IndexWriter appending to the existing index
	 * @throws IOException if the index can not be opened for writing
	 */
	public IndexWriter getWriter()
		throws IOException
	{
		//false: open the existing index, don't create a new one
		return new IndexWriter(m_directory, m_analyzer, false);
	}

	/**
	 * Get an index reader for the lucene index at this directory.
	 * You can not instantiate an IndexReader directly.
	 * Close the reader once you are done.
	 * @return a newly opened IndexReader
	 * @throws IOException if the index can not be opened
	 */
	public IndexReader getReader() throws IOException
	{
		return IndexReader.open(m_directory);
	}

	/**
	 * Get a searcher for this index. Close the searcher once you are done.
	 * @return a newly opened searcher
	 * @throws IOException if the index can not be opened
	 */
	public Searcher getIndexSearcher() throws IOException
	{
		//Let the searcher open its own reader from the directory path.
		//A searcher built around a caller supplied reader does not close
		//that reader on close(), which leaked one reader per search.
		return new IndexSearcher(m_directory);
	}

	/**
	 * Creates an index if it doesn't exist.
	 * If it exists, no action is taken.
	 * 
	 * Useful for initialization code.
	 * @throws IOException if the index can not be created
	 */
	public void createIndex() throws IOException
	{
		//create an index before any operation on this index takes place
		if (IndexReader.indexExists(m_directory))
		{
			Log.log("Index already exists");
			return;
		}
		
		//Index is not there create it
		IndexWriter writer = new IndexWriter(m_directory, m_analyzer, true);
		writer.close();
		Log.log("Index created successfully");
	}//eof-method
	
	/**
	 * Erase the current index and create a new, empty one
	 * @throws IOException if the index can not be created
	 */
	public void recreateIndex() throws IOException
	{
		//true: create, overwriting whatever is in the directory
		IndexWriter writer = new IndexWriter(m_directory, m_analyzer, true);
		writer.close();
	}
	
	/**
	 * Remove all documents by replacing the index with an empty one.
	 * @throws IOException if the index can not be recreated
	 */
	public void deleteIndex() throws IOException
	{
		recreateIndex();
	}

	/**
	 * Index the contents of a url into this index.
	 * Only a "content" field is stored; the document carries no identity.
	 * @param inUrl address of the html page to index
	 * @throws MalformedURLException if inUrl is not a valid url
	 * @throws IOException if the page can not be read or the index not written
	 */
	public void indexAURL(String inUrl)
		throws MalformedURLException, IOException
	{
		//Writes to this index (previously this went to the global holder's
		//index no matter which instance was called)
		indexUrlContent(inUrl, false);
	}//eof-function

	/**
	 * Index the contents of a url into this index. Use the url as a unique identifier
	 * If the url already exists don't index it
	 * @param inUrl address of the html page to index
	 * @throws MalformedURLException if inUrl is not a valid url
	 * @throws IOException if the page can not be read or the index not accessed
	 */
	public void indexAURLWithIdentity(String inUrl)
		throws MalformedURLException, IOException
	{
		if (locateIndexedUrl(inUrl) != null)
		{
			//Document already exists
			return;
		}
		alwaysIndexAURLWithIdentity(inUrl);
	}//eof-function
	
	/**
	 * Index the contents of a url, replacing any document previously
	 * indexed under the same url.
	 * @param inUrl address of the html page to index
	 * @throws IOException if the page can not be read or the index not accessed
	 */
	public void reIndexAURLWithIdentity(String inUrl) throws IOException
	{
		if (locateIndexedUrl(inUrl) != null)
		{
			//Document already exists, delete it first
			deleteAURLIndex(inUrl);
		}
		// no document at this time
		alwaysIndexAURLWithIdentity(inUrl);
	}
	
	/**
	 * Index the contents of a url under its identity without checking
	 * whether it is already indexed. Adds the fields "content" (with term
	 * vector), "docid" (keyword) and "url" (stored, not indexed).
	 * @param inUrl address of the html page to index
	 * @throws MalformedURLException if inUrl is not a valid url
	 * @throws IOException if the page can not be read or the index not written
	 */
	public void alwaysIndexAURLWithIdentity(String inUrl)
		throws MalformedURLException, IOException
	{
		indexUrlContent(inUrl, true);
	}//eof-function

	/**
	 * Retrieve a document based on an earlier indexed url
	 * @param inUrl url the document was indexed under
	 * @return the matching document, or null if the url is not indexed
	 * @throws IOException if the index can not be searched
	 * @throws RuntimeException if more than one document matches
	 */
	public Document locateIndexedUrl(String inUrl) throws IOException
	{
		Searcher s = null;
		try
		{
			TermQuery termQuery = new TermQuery(new Term("docid",inUrl));
			s = getIndexSearcher(); 
			Hits hits = s.search(termQuery);
			if (hits.length() == 0)
			{
				return null;
			}
			if (hits.length() > 1)
			{
				throw new RuntimeException("Too many hits.");
			}
			return hits.doc(0);
		}
		finally
		{
			if (s != null) s.close();
		}
	}//eof-function

	/**
	 * Retrieve the lucene document number of an earlier indexed url
	 * @param inUrl url the document was indexed under
	 * @return document number of the single matching document
	 * @throws IOException if the index can not be searched
	 * @throws RuntimeException if no document or more than one document matches
	 */
	public int locateIndexedUrlDocNumber(String inUrl) throws IOException
	{
		Searcher s = null;
		try
		{
			TermQuery termQuery = new TermQuery(new Term("docid",inUrl));
			s = getIndexSearcher(); 
			Hits hits = s.search(termQuery);
			if (hits.length() == 0)
			{
				throw new RuntimeException("Document not found");
			}
			if (hits.length() > 1)
			{
				throw new RuntimeException("Too many hits.");
			}
			return hits.id(0);
		}
		finally
		{
			if (s != null) s.close();
		}
	}//eof-function
	
	/**
	 * Delete the document(s) indexed under the given url
	 * @param inUrl url the document was indexed under
	 * @throws IOException if the index can not be modified
	 */
	public void deleteAURLIndex(String inUrl) throws IOException
	{
		IndexReader reader = getReader();
		try
		{
			reader.delete(new Term("docid",inUrl));
		}
		finally
		{
			//Closing is required: it flushes the deletions and releases
			//the write lock so a writer can be opened afterwards
			reader.close();
		}
	}

	/**
	 * Fetch a url, parse it as html and add it to this index as one document.
	 * @param inUrl address of the html page to index
	 * @param withIdentity true to store a term vector for the content and
	 *        add the "docid" and "url" fields; false for content only
	 */
	private void indexUrlContent(String inUrl, boolean withIdentity)
		throws MalformedURLException, IOException
	{
		InputStream is = null;
		Reader htmlReader = null;
		IndexWriter writer = null;
		
		try
		{
			//Open the url
			java.net.URL url = new java.net.URL(inUrl);
			is = url.openStream();
			
			//Parse the content
			HTMLParser parser = new HTMLParser(is);
			htmlReader = parser.getReader();
			
			//Create a document and add the fields
			Document doc = new Document();
			if (withIdentity)
			{
				doc.add(Field.Text("content",htmlReader,true));
				doc.add(Field.Keyword("docid",inUrl));
				doc.add(Field.UnIndexed("url",inUrl));
			}
			else
			{
				doc.add(Field.Text("content",htmlReader));
			}
			
			writer = getWriter();
			writer.addDocument(doc);
			writer.optimize();
		}
		finally
		{
			closeAll(writer, htmlReader, is);
		}
	}

	/**
	 * Close the given resources in order, attempting every close even when
	 * an earlier one throws. Null arguments are skipped.
	 */
	private static void closeAll(IndexWriter writer, Reader reader, InputStream stream)
		throws IOException
	{
		try
		{
			if (writer != null) writer.close();
		}
		finally
		{
			try
			{
				if (reader != null) reader.close();
			}
			finally
			{
				if (stream != null) stream.close();
			}
		}
	}
}//eof-class

Satya - Saturday, July 02, 2005 12:37:34 PM

Here is the test driver for the LuceneIndex class

/*
 * Created on Jun 25, 2005
 *
 * TODO To change the template for this generated file go to
 * Window - Preferences - Java - Code Style - Code Templates
 */
package com.ai.lucene;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.MalformedURLException;

import org.apache.lucene.demo.html.HTMLParser;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermFreqVector;

public class Test {
	public static String url=
"http://216.187.231.34/akc/servlet/DisplayServlet?url
=DisplayNoteURL&ownerUserId=satya&reportId=1358";
		

	public static void main(String[] args) 
	{
		try
		{
			//testLocateUrl();
			//setupAUrl();
			getTermVectors();
		}
		catch(Throwable t)
		{
			Log.log(t);
		}
	}
	
	public static void testIndexWriter() throws IOException
	{
		Log.log("Start");
		LuceneIndex li = LuceneIndexHolder.getLuceneIndex();
		IndexWriter writer = li.getWriter();
		writer.close();
		Log.log("finish");
	}
	public static void testIndexReader() throws IOException
	{
		Log.log("Start");
		LuceneIndex li = LuceneIndexHolder.getLuceneIndex();
		IndexReader reader = li.getReader();
		int numberOfDocs = reader.numDocs();
		Log.log("Number of docs in the index:" + numberOfDocs);
		Log.log("finish");
	}
	
	public static void testIndexAURL() throws IOException
	{
		Log.log("Start");
		LuceneIndex li = LuceneIndexHolder.getLuceneIndex();
		li.indexAURLWithIdentity(Test.url);
		Log.log("finish indexing the url with identity");
	}
	
	public static void deleteIndex() throws IOException
	{
		Log.log("Start");
		LuceneIndex li = LuceneIndexHolder.getLuceneIndex();
		li.deleteIndex();
		Log.log("finish deleting index");
	}
	
	public static Document testLocateUrl() throws IOException
	{
		testIndexAURL();
		testIndexReader();
		Log.log("Start");
		LuceneIndex li = LuceneIndexHolder.getLuceneIndex();
		Document reply = 
			li.locateIndexedUrl(Test.url);
		if (reply != null)
		{
			Log.log("success");
		}
		else
		{
			Log.log("failure");
		}
		Log.log("Finish");
		return reply;
	}
	
	public static void runS1() throws IOException
	{
		deleteIndex();
		testIndexWriter();
		testIndexReader();
		testIndexAURL();
	}
	public static void setupAUrl() throws IOException
	{
		deleteIndex();
		testIndexAURL();
		testLocateUrl();
	}
	
	public static void getTermVectors() throws IOException
	{
		LuceneIndex li = LuceneIndexHolder.getLuceneIndex();
		int docnum = li.locateIndexedUrlDocNumber(Test.url);
		IndexReader reader = li.getReader();
		Log.log("Index has " + reader.numDocs() + " documents.");
		TermFreqVector tfv = 
                        reader.getTermFreqVector(docnum,"content");
		Log.log(tfv.toString());
	}
}//eof-class

Satya - Saturday, July 02, 2005 12:38:56 PM

Here is the index holder class, which creates and exposes the shared LuceneIndex instance

package com.ai.lucene;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/**
 * Holds the single LuceneIndex shared by the application.
 * The index is set up when this class is loaded: the index files are
 * created in the configured directory if they are not there yet.
 */
public class LuceneIndexHolder 
{
	//Directory holding the index files
	private static final String m_path = 
           "e:\\satya\\webapps\\lucene\\luceneindex";
	private static final LuceneIndex m_index;
	static 
	{
		LuceneIndex index = 
			new LuceneIndex(m_path,new StandardAnalyzer());
		try
		{
			index.createIndex();
		}
		catch(IOException x)
		{
			//Sorry can't create an index.
			//Keep the IOException as the cause so the reason is not lost.
			throw new RuntimeException(
				"Could not create a lucene index", x);
		}
		m_index = index;
	}

	/**
	 * @return the shared index, never null once this class has loaded
	 */
	public static LuceneIndex getLuceneIndex()
	{
		return m_index;
	}

}

Satya - Tuesday, July 05, 2005 3:11:55 PM

Sorting a Lucene term-frequency list to find the high-frequency words

package com.ai.lucene;

import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

/**
 * A word together with the number of times it occurs.
 * Also offers static helpers to print and sort lists of such pairs.
 */
public class WordFrequency 
{
	//Shared comparator used by sortByDescendingFrequency
	private static WordFrequencyComparator 
             m_wordFrequencyComparator = 
                 new WordFrequencyComparator();

	public String word;
	public int frequency;

	/**
	 * @param inWord the word
	 * @param inFrequency how often the word occurs
	 */
	public WordFrequency(String inWord, int inFrequency) 
	{
		word = inWord;
		frequency = inFrequency;
	}

	/**
	 * @return this entry formatted as "frequency:word"
	 */
	public String toString()
	{
		return frequency + ":" + word;
	}

	/**
	 * Format a sequence of entries, one per line.
	 * @param itr iterator over WordFrequency objects
	 * @return every entry as "frequency:word", each followed by a newline
	 */
	public static String toString(Iterator itr)
	{
		StringBuffer sbuf = new StringBuffer();
		while(itr.hasNext())
		{
			WordFrequency  wf = (WordFrequency)itr.next();
			sbuf.append(wf.toString());
			sbuf.append("\n");
		}
		return sbuf.toString();
	}
	
	/**
	 * Sort the given list in place using the shared
	 * WordFrequencyComparator.
	 * @param listOfWordFrequencies list of WordFrequency objects
	 */
	public static void 
        sortByDescendingFrequency(List listOfWordFrequencies)
	{
		Collections.sort(listOfWordFrequencies,
                              m_wordFrequencyComparator);
	}
}

/**
 * Compares two WordFrequency objects by their frequency field.
 * The ordering itself is delegated to CompareUtils.descendingIntCompare;
 * presumably that places higher frequencies first -- confirm against
 * CompareUtils.
 */
class WordFrequencyComparator implements Comparator
{
	/**
	 * @param arg0 first WordFrequency
	 * @param arg1 second WordFrequency
	 * @return the result of CompareUtils.descendingIntCompare on the
	 *         two frequencies
	 */
	public int compare(Object arg0, Object arg1) 
	{
		//Cast both arguments up front so a wrong type fails before any field is read
		final WordFrequency left = (WordFrequency)arg0;
		final WordFrequency right = (WordFrequency)arg1;

		final int leftCount = left.frequency;
		final int rightCount = right.frequency;
		return CompareUtils.descendingIntCompare(leftCount, rightCount);
	}
}