1-Mar-10 (Created: 1-Mar-10) | More in 'Lucene'

15.00 java/Samplecode: lucene

 
/*
 * Created on Jun 25, 2005
 *
 */
package com.ai.lucene;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.MalformedURLException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.demo.html.HTMLParser;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;

/**
 * Convenience wrapper around a single Lucene index stored in a file-system
 * directory.
 *
 * Holds the index directory path and the analyzer, and offers helpers to
 * create or recreate the index, to open writers, readers and searchers on it,
 * and to index the HTML content of a url. Documents indexed "with identity"
 * carry the url in a "docid" keyword field so they can later be located,
 * re-indexed or deleted.
 *
 * Every writer, reader and searcher handed out by the getters must be closed
 * by the caller.
 *
 * @author satya
 */
public class LuceneIndex 
{
	//Field holding the parsed page text
	private static final String FIELD_CONTENT = "content";

	//Keyword field holding the url as the unique identifier of a document
	private static final String FIELD_DOCID = "docid";

	//Stored-only copy of the url
	private static final String FIELD_URL = "url";

	//Index directory
	private final String m_directory;

	//Analyzer for this index
	private final Analyzer m_analyzer;

	/**
	 * @param path directory holding the index
	 * @param inAnalyzer analyzer used when writing to the index;
	 *        a StandardAnalyzer is used when this is null
	 */
	public LuceneIndex(String path, Analyzer inAnalyzer)
	{
		m_directory = path;
		// Do not let a null argument wipe out the default analyzer
		m_analyzer = (inAnalyzer != null) ? inAnalyzer : new StandardAnalyzer();
	}

	/**
	 * Get a lucene index writer for the given index.
	 * The index must already exist; it is opened, not created.
	 * Close the writer once you are done.
	 * -- Not sure if createWriter is a better name for this method --
	 * @return IndexWriter
	 * @throws IOException
	 */
	public IndexWriter getWriter()
		throws IOException
	{
		//Open the existing index for writing (create flag is false)
		return new IndexWriter(m_directory, m_analyzer, false);
	}

	/**
	 * Get an index reader for this lucene index at this directory.
	 * You can not instantiate an IndexReader directly.
	 * Close the reader once you are done.
	 * @return a newly opened IndexReader
	 * @throws IOException
	 */
	public IndexReader getReader() throws IOException
	{
		return IndexReader.open(m_directory);
	}

	/**
	 * Get a searcher over this index.
	 * The searcher is opened from the directory path so that it owns its
	 * underlying reader: closing the searcher also closes that reader.
	 * (A searcher built from an existing IndexReader leaves the reader open.)
	 * Close the searcher once you are done.
	 * @return a newly opened Searcher
	 * @throws IOException
	 */
	public Searcher getIndexSearcher() throws IOException
	{
		return new IndexSearcher(m_directory);
	}

	/**
	 * Creates an index if it doesn't exist.
	 * If it exists, no action is taken.
	 * 
	 * Useful for initialization code.
	 * @throws IOException
	 */
	public void createIndex() throws IOException
	{
		//create an index before any operation on this index takes place
		if (IndexReader.indexExists(m_directory))
		{
			Log.log("Index already exists");
			return;
		}

		//Index is not there create it
		writeEmptyIndex();
		Log.log("Index created successfully");
	}//eof-method

	/**
	 * Erase the current index and create a new one
	 * @throws IOException
	 */
	public void recreateIndex() throws IOException
	{
		writeEmptyIndex();
	}

	/**
	 * Remove everything from the index.
	 * Implemented by recreating the index as an empty one.
	 * @throws IOException
	 */
	public void deleteIndex() throws IOException
	{
		recreateIndex();
	}

	/**
	 * Index the contents of a url into this index.
	 * No identifying field is stored, so the document can not be located
	 * by url afterwards.
	 * @param inUrl
	 * @throws MalformedURLException
	 * @throws IOException
	 */
	public void indexAURL(String inUrl)
		throws MalformedURLException, IOException
	{
		addUrlDocument(inUrl, false);
	}//eof-function

	/**
	 * Index the contents of a url into this index. Use the url as a unique identifier
	 * If the url already exists don't index it
	 * @param inUrl
	 * @throws MalformedURLException
	 * @throws IOException
	 */
	public void indexAURLWithIdentity(String inUrl)
		throws MalformedURLException, IOException
	{
		if (locateIndexedUrl(inUrl) != null)
		{
			//Document already exists
			return;
		}
		alwaysIndexAURLWithIdentity(inUrl);
	}//eof-function

	/**
	 * Index the contents of a url, replacing any document previously
	 * indexed under the same url.
	 * @param inUrl
	 * @throws IOException
	 */
	public void reIndexAURLWithIdentity(String inUrl) throws IOException
	{
		if (locateIndexedUrl(inUrl) != null)
		{
			//Document already exists: delete it first
			deleteAURLIndex(inUrl);
		}
		// no document at this time
		alwaysIndexAURLWithIdentity(inUrl);
	}

	/**
	 * Index the contents of a url with the url as its identifier,
	 * without checking whether the url has been indexed before.
	 * @param inUrl
	 * @throws MalformedURLException
	 * @throws IOException
	 */
	public void alwaysIndexAURLWithIdentity(String inUrl)
		throws MalformedURLException, IOException
	{
		addUrlDocument(inUrl, true);
	}//eof-function

	/**
	 * Retrieve a document based on an earlier indexed url
	 * @param inUrl
	 * @return the matching document, or null if the url is not indexed
	 * @throws IOException
	 * @throws RuntimeException if more than one document matches the url
	 */
	public Document locateIndexedUrl(String inUrl) throws IOException
	{
		Searcher searcher = getIndexSearcher();
		try
		{
			Hits hits = findByDocId(searcher, inUrl);
			if (hits.length() == 0)
			{
				return null;
			}
			//Fetch the document while the searcher is still open
			return hits.doc(0);
		}
		finally
		{
			searcher.close();
		}
	}//eof-function

	/**
	 * Retrieve the lucene document number of an earlier indexed url
	 * @param inUrl
	 * @return the document number of the single matching document
	 * @throws IOException
	 * @throws RuntimeException if no document, or more than one document,
	 *         matches the url
	 */
	public int locateIndexedUrlDocNumber(String inUrl) throws IOException
	{
		Searcher searcher = getIndexSearcher();
		try
		{
			Hits hits = findByDocId(searcher, inUrl);
			if (hits.length() == 0)
			{
				throw new RuntimeException("Document not found");
			}
			return hits.id(0);
		}
		finally
		{
			searcher.close();
		}
	}//eof-function

	/**
	 * Delete the document(s) indexed under the given url.
	 * @param inUrl
	 * @throws IOException
	 */
	public void deleteAURLIndex(String inUrl) throws IOException
	{
		IndexReader reader = getReader();
		try
		{
			reader.delete(new Term(FIELD_DOCID, inUrl));
		}
		finally
		{
			//Closing the reader saves the deletion and frees the index
			//for a subsequent writer
			reader.close();
		}
	}

	/**
	 * Create a brand new, empty index in the directory,
	 * replacing whatever index is there.
	 */
	private void writeEmptyIndex() throws IOException
	{
		IndexWriter writer = new IndexWriter(m_directory, m_analyzer, true);
		writer.close();
	}

	/**
	 * Fetch a url, parse its html and add it to this index as one document.
	 * @param inUrl url to fetch
	 * @param withIdentity when true the url is also recorded in the
	 *        "docid" and "url" fields and the content field is added with
	 *        the term vector flag set
	 */
	private void addUrlDocument(String inUrl, boolean withIdentity)
		throws MalformedURLException, IOException
	{
		//Open the url
		java.net.URL url = new java.net.URL(inUrl);
		InputStream is = url.openStream();
		try
		{
			//Parse the content
			HTMLParser parser = new HTMLParser(is);
			Reader htmlReader = parser.getReader();
			try
			{
				//Create a document and add the fields to it
				Document doc = new Document();
				if (withIdentity)
				{
					doc.add(Field.Text(FIELD_CONTENT, htmlReader, true));
					doc.add(Field.Keyword(FIELD_DOCID, inUrl));
					doc.add(Field.UnIndexed(FIELD_URL, inUrl));
				}
				else
				{
					doc.add(Field.Text(FIELD_CONTENT, htmlReader));
				}

				//Write into this index
				IndexWriter writer = getWriter();
				try
				{
					writer.addDocument(doc);
					writer.optimize();
				}
				finally
				{
					writer.close();
				}
			}
			finally
			{
				//Runs even if closing the writer failed
				htmlReader.close();
			}
		}
		finally
		{
			//Runs even if closing the writer or the html reader failed
			is.close();
		}
	}

	/**
	 * Search the given searcher for documents whose "docid" is the url.
	 * The returned hits are only usable while the searcher is open.
	 * @throws RuntimeException if more than one document matches
	 */
	private Hits findByDocId(Searcher searcher, String inUrl) throws IOException
	{
		TermQuery termQuery = new TermQuery(new Term(FIELD_DOCID, inUrl));
		Hits hits = searcher.search(termQuery);
		if (hits.length() > 1)
		{
			throw new RuntimeException("Too many hits.");
		}
		return hits;
	}
}//eof-class