Search

Search Lucene Index created in database using JdbcDirectory

Posted on Updated on

In our last exercise we created a Lucene index in a database using JdbcDirectory, which comes with Compass. In this post we will search against the index we created. With that said, let’s get our hands dirty and write some code.

Please note following files will be used from create lucene index post

  1. pom.xml
  2. MyJDBCDirectory.java
  3. JDBCBatchInsert.java
  4. JDBCIndexer.java (without creating index we cannot search against it ;), so in our test case below we will create an index before searching)
  5. JDBCDatabaseUtil.java
  6. Database schema

Now, with some code from our backyard, let’s finish up search quickly. Below is JDBCSearcher.java, which has a very simple search method that takes the name of the index field and the value we want to search for. It returns true if the search finds a match, or false otherwise. Please note that with this search we are only interested in the highest-scoring result; the others we don’t care about.

package com.mumz.test.lucene.jdbc;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

/**
 * The Class InMemoryDirectorySearcher.
 * 
 * @author prabhat.jha
 */
public class JDBCSearcher {

	/** The directory. */
	private Directory	directory	= null;

	/**
	 * The Constructor.
	 * 
	 * @param directory
	 *            the directory
	 */
	public JDBCSearcher(Directory directory) {
		this.directory = directory;
	}

	/**
	 * Search.
	 * 
	 * @param fileName
	 *            the file name
	 * @return the string
	 */
	public boolean search(String columnName, String value) {
		IndexSearcher indexSearcher = null;
		try {
			/**
			 * Specify the version
			 */
			Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_36);
			/**
			 * Create query columnname (index name passed), we built out index for name, author and publisher so
			 * we have to search against the same.
			 */
			Query query = new QueryParser(Version.LUCENE_36, columnName, analyzer).parse(value);
			IndexReader indexReader = IndexReader.open(directory);
			indexSearcher = new IndexSearcher(indexReader);
			/**
			 * This will hold all the results which results from the search
			 * operation
			 */
			TopDocs topDocs = indexSearcher.search(query, 1);
			if (topDocs.scoreDocs.length > 0) {
				System.out.println("Found :  Book with id = " + indexSearcher.doc(topDocs.scoreDocs[0].doc).get("BOOKID") + " , Name = "
						+ indexSearcher.doc(topDocs.scoreDocs[0].doc).get("name") + " ,Author = "
						+ indexSearcher.doc(topDocs.scoreDocs[0].doc).get("author") + " ,Publisher = "
						+ indexSearcher.doc(topDocs.scoreDocs[0].doc).get("publisher") + " with hits : " + topDocs.scoreDocs[0].doc);
				return true;
			} else {
				System.out.println("No Record found");
				return false;
			}
		} catch (ParseException e) {
			e.printStackTrace();
		} catch (CorruptIndexException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (indexSearcher != null) {
				try {
					indexSearcher.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			indexSearcher = null;
		}
		return false;
	}
}

If you want to see all the hits, you can use the code snippet below.

// Collect up to 100 hits through a collector instead of asking for the top one only.
TopScoreDocCollector results = TopScoreDocCollector.create(100, true);
indexSearcher.search(query, results);
ScoreDoc[] scores = results.topDocs().scoreDocs;
for (ScoreDoc scoreDoc : scores) {
	// NOTE(review): JDBCSearcher reads the stored id from the field "BOOKID";
	// confirm which field name the indexer actually stores.
	System.out.println("Found :  Book with id = " + indexSearcher.doc(scoreDoc.doc).get("id") + " , Name = "
			+ indexSearcher.doc(scoreDoc.doc).get("name") + " ,Author = "
			+ indexSearcher.doc(scoreDoc.doc).get("author") + " ,Publisher = "
			+ indexSearcher.doc(scoreDoc.doc).get("publisher") + " with hits : " + scoreDoc.score);
}

And finally here is our JUnit test case, please note this test case assumes that you also have indexer code with you.

package com.mumz.test.lucene.jdbc;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.jdbc.dialect.MySQLDialect;

import junit.framework.TestCase;

/**
 * The Class LuceneJDBCTest.
 * @author prabhat.jha
 */
public class LuceneJDBCTest extends TestCase {

	/** The directory. */
	private Directory	directory	= null;

       /** (non-Javadoc)
	 * @see junit.framework.TestCase#setUp()
	 */
	protected void setUp() throws Exception {
		directory = new MyJDBCDirectory(JDBCDatabaseUtil.getDataSource(), new MySQLDialect(), "LUCENE_INDEX_TABLE");
		super.setUp();
	}

	/**
	 * Test insert record.
	 */
	public void testInsertRecord() {		
		new JDBCBatchInsert().insertRecords();
	}

	/**
	 * Test build index.
	 */
	public void testBuildIndex() {
		new JDBCIndexer(directory).buildIndex();
	}

	/**
	 * Test search record on name.
	 */
	public void testSearchRecordOnName() {
		boolean found = new JDBCSearcher(directory).search("name", "Spring In Action");
		assertEquals(found, true);
	}

	/**
	 * Test search record fail on name.
	 */
	public void testSearchRecordFailOnName() {
		boolean found = new JDBCSearcher(directory).search("name", "No Such BookName");
		assertEquals(found, false);
	}

	/**
	 * Test search record on author.
	 */
	public void testSearchRecordOnAuthor() {
		boolean found = new JDBCSearcher(directory).search("author", "Test Author Hibernate In Action10");
		assertEquals(found, true);
	}

	/**
	 * Test search record fail on author.
	 */
	public void testSearchRecordFailOnAuthor() {
		boolean found = new JDBCSearcher(directory).search("name", "No Such Author");
		assertEquals(found, false);
	}

	/**
	 * Test search record on publisher.
	 */
	public void testSearchRecordOnPublisher() {
		boolean found = new JDBCSearcher(directory).search("publisher", "Test Publisher Spring Bible7");
		assertEquals(found, true);
	}

	/**
	 * Test search record fail on publisher.
	 */
	public void testSearchRecordFailOnPublisher() {
		boolean found = new JDBCSearcher(directory).search("name", "No Such Publisher");
		assertEquals(found, false);
	}
	
	/* (non-Javadoc)
	 * @see junit.framework.TestCase#tearDown()
	 */
	protected void tearDown() throws Exception {
		if(directory != null) {
			directory.close();
		}
		super.tearDown();
	}
}

Index and Search a Directory using Apache Lucene

Posted on

Building a custom search is a common requirement in almost every application. Building such a system can be complex and tedious with numerous use cases around. Apache Lucene is a search framework built in Java. In this tutorial we will write a small application which will use Lucene to search a given directory.

Lucene works on mainly two concepts:

  1. Index : Lucene provides great search throughput by building an index on the content and then searching those indexes. Content is divided into smaller units and indexed.
  2. Search : Once indexing is complete any query can be compared with the indexes and result is obtained.

First we will build our Eclipse Maven Project. Below is our pom.xml.

pom.xml

<!-- Maven build descriptor for the ApacheLuceneTest sample project. -->
<project 
	xmlns="http://maven.apache.org/POM/4.0.0" 
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation=
		"http://maven.apache.org/POM/4.0.0 
		http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<!-- Project coordinates -->
	<groupId>com.mumz.test.lucene</groupId>
	<artifactId>ApacheLuceneTest</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<name>ApacheLuceneTest</name>
	<description>ApacheLuceneTest</description>
	<dependencies>
		<!-- Lucene core 3.6.1 is the only declared dependency; the sample code uses Version.LUCENE_36 -->
		<dependency>
			<artifactId>lucene-core</artifactId>
			<groupId>org.apache.lucene</groupId>
			<type>jar</type>
			<version>3.6.1</version>
		</dependency>
	</dependencies>
</project>

Second, we will write our Indexer. The Indexer accepts a path to a directory, then recursively reads the contents of that directory and builds an in-memory index. The smallest unit in Lucene is called a Document, and you can add multiple Fields to a document. Here we are adding the file name to our index and calling the field title.

InMemoryDirectoryIndexer.java


package com.mumz.test.lucene.first;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds an in-memory Lucene index of the names of all files and directories
 * found below a given directory. Each name is stored in the field "title".
 * 
 * @author prabhat.jha
 */
public class InMemoryDirectoryIndexer {
	
	/** The index directory; stays {@code null} until an index was built successfully. */
	private Directory	directory	= null;

	/**
	 * Builds the index for the given path.
	 * 
	 * @param path
	 *            path of the directory whose contents should be indexed
	 * @throws IllegalArgumentException
	 *             if {@code path} is {@code null} or is not an existing directory
	 */
	public InMemoryDirectoryIndexer(String path) {
		indexDirectory(path);
	}

	/**
	 * Gets the directory holding the built index.
	 * 
	 * @return the directory
	 * @throws IllegalStateException
	 *             if the index could not be built
	 */
	public Directory getDirectory() {
		if (directory == null) {
			throw new IllegalStateException("Index hasn't been built, check configratuion and reIndex");
		}
		return directory;
	}

	/**
	 * Indexes everything below {@code path} into a fresh in-memory directory.
	 * 
	 * @param path
	 *            the path of the root directory to index
	 */
	private void indexDirectory(String path) {
		if (path == null) {
			throw new IllegalArgumentException("path must not be null");
		}
		File root = new File(path);
		if (!root.isDirectory()) {
			// Fail with a clear message instead of a NullPointerException from listFiles().
			throw new IllegalArgumentException("Not a directory: " + path);
		}
		// The index lives in memory; do not do this for a huge amount of data,
		// as it can lead to a heavy memory footprint.
		Directory ramDirectory = new RAMDirectory();
		// If the Lucene version changes, the index has to be rebuilt.
		Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_36);
		IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_36, analyzer);
		IndexWriter indexWriter = null;
		boolean indexed = false;
		try {
			indexWriter = new IndexWriter(ramDirectory, indexWriterConfig);
			addIndex(indexWriter, root);
			indexed = true;
		} catch (IOException e) {
			// Also covers CorruptIndexException and LockObtainFailedException,
			// both of which are IOExceptions.
			e.printStackTrace();
		} finally {
			if (indexWriter != null) {
				try {
					indexWriter.close();
				} catch (IOException e) {
					indexed = false;
					e.printStackTrace();
				}
			}
		}
		// Publish the directory only after a successful build, so that
		// getDirectory() can actually report a failed one.
		if (indexed) {
			directory = ramDirectory;
		}
	}
	
	
	/**
	 * Recursively adds one document per entry below {@code root}. Directory
	 * names are indexed as well as file names.
	 * 
	 * @param indexWriter
	 *            the index writer
	 * @param root
	 *            the directory whose entries are added
	 * @throws CorruptIndexException
	 *             the corrupt index exception
	 * @throws IOException
	 *             the IO exception
	 */
	private void addIndex(IndexWriter indexWriter, File root) throws CorruptIndexException, IOException{
		File[] children = root.listFiles();
		if (children == null) {
			// listFiles() returns null when root is not a directory or cannot be read; skip it.
			return;
		}
		for (File file : children) {
			if (file.isDirectory()) {
				addIndex(indexWriter, file);
			}
			Document document = new Document();
			document.add(new Field("title", file.getName(), Field.Store.YES, Field.Index.ANALYZED));
			indexWriter.addDocument(document);
		}
	}
}

Third we will write our Searcher, code which searches against the index built in the last step.
InMemoryDirectorySearcher.java


package com.mumz.test.lucene.first;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

/**
 * Searches the "title" field of a Lucene index and prints every hit (up to
 * 100) together with its score.
 * 
 * @author prabhat.jha
 */
public class InMemoryDirectorySearcher {
	
	/** The directory that holds the index to search. */
	private Directory	directory	= null;

	/**
	 * Creates a searcher over the given index directory.
	 * 
	 * @param directory
	 *            the directory containing the index; it is not closed by this class
	 */
	public InMemoryDirectorySearcher(Directory directory) {
		this.directory = directory;
	}

	/**
	 * Parses {@code fileName} as a query against the "title" field and prints
	 * each matching title with its score to standard out. Parse and I/O errors
	 * are printed and otherwise ignored.
	 * 
	 * @param fileName
	 *            the query text, in Lucene query parser syntax
	 */
	public void search(String fileName) {
		IndexReader indexReader = null;
		IndexSearcher indexSearcher = null;
		try {
			// Analyzer and query parser are pinned to the Lucene 3.6 behaviour.
			Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_36);
			// The index was built on the field "title", so the query targets it too.
			Query query = new QueryParser(Version.LUCENE_36, "title", analyzer).parse(fileName);
			indexReader = IndexReader.open(directory);
			indexSearcher = new IndexSearcher(indexReader);
			// Collect up to 100 hits.
			TopScoreDocCollector results = TopScoreDocCollector.create(100, true);
			indexSearcher.search(query, results);
			ScoreDoc[] scores = results.topDocs().scoreDocs;
			for (ScoreDoc scoreDoc : scores) {
				System.out.println("Found : " + indexSearcher.doc(scoreDoc.doc).get("title") + " with hits : " + scoreDoc.score);
			}
		} catch (ParseException e) {
			e.printStackTrace();
		} catch (CorruptIndexException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (indexSearcher != null) {
				try {
					indexSearcher.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			// An IndexSearcher built from an IndexReader does not close that
			// reader, so it has to be closed explicitly to avoid a leak.
			if (indexReader != null) {
				try {
					indexReader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
}

Final step is to write a simple java class to showcase our indexer and searcher in action.
InMemoryMainApp.java


package com.mumz.test.lucene.first;

import java.util.Scanner;

/**
 * Console driver: asks for a directory, indexes it in memory and then answers
 * file name queries until the user quits.
 * 
 * @author prabhat.jha
 */
public class InMemoryMainApp {
	
	/**
	 * Runs the interactive index-and-search loop on standard input/output.
	 * Input is read line by line, so directory names and queries may contain
	 * spaces. Quitting, or reaching the end of the input, ends the program
	 * normally.
	 * 
	 * @param args
	 *            not used
	 */
	public static void main(String[] args) {
		Scanner scanner = new Scanner(System.in);
		try {
			System.out.println("Enter directory name");
			String directory = readLine(scanner);
			if (directory == null) {
				return;
			}
			System.out.println("Should Index ? Enter Y for Yes or else q or quit to exit.");
			String response = readLine(scanner);
			if (!"y".equalsIgnoreCase(response)) {
				// "q", "quit", end of input or any other answer: nothing to do.
				return;
			}
			InMemoryDirectoryIndexer inMemoryDirectoryIndexer = new InMemoryDirectoryIndexer(directory);
			// The searcher only keeps a reference to the directory, so a single instance serves every query.
			InMemoryDirectorySearcher inMemoryFileSearcher = new InMemoryDirectorySearcher(inMemoryDirectoryIndexer.getDirectory());
			while (true) {
				System.out.println("Enter fileName to query or quit to exit.");
				String fileName = readLine(scanner);
				if (fileName == null || "Quit".equalsIgnoreCase(fileName)) {
					return;
				}
				if (fileName.length() == 0) {
					// Nothing typed; ask again instead of parsing an empty query.
					continue;
				}
				inMemoryFileSearcher.search(fileName);
			}
		} finally {
			scanner.close();
		}
	}

	/**
	 * Reads the next input line, trimmed.
	 * 
	 * @param scanner
	 *            the scanner to read from
	 * @return the trimmed line, or {@code null} when the input is exhausted
	 */
	private static String readLine(Scanner scanner) {
		return scanner.hasNextLine() ? scanner.nextLine().trim() : null;
	}
}

That’s all; we have just touched the tip of the iceberg. It is a vast and very interesting space, so let’s see how much we can cover in the coming days.