Search

Index and Search a Directory using Apache Lucene

Posted on

Building a custom search is a common requirement in almost every application. Building such a system from scratch can be complex and tedious, given the number of use cases it has to cover. Apache Lucene is a search framework written in Java. In this tutorial we will write a small application that uses Lucene to search a given directory.

Lucene is built mainly around two concepts:

  1. Index : Lucene provides great search throughput by building an index on the content and then searching that index. Content is divided into smaller units, which are then indexed.
  2. Search : Once indexing is complete, any query can be run against the index to obtain the matching results.

First, we will build our Eclipse Maven project. Below is our pom.xml.

pom.xml

<!--
	Maven project descriptor for the Lucene directory-search example.
	The only dependency is lucene-core, which supplies the
	org.apache.lucene.* classes imported by the Java sources of this tutorial.
-->
<project 
	xmlns="http://maven.apache.org/POM/4.0.0" 
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation=
		"http://maven.apache.org/POM/4.0.0 
		http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<!-- Coordinates of this example project. -->
	<groupId>com.mumz.test.lucene</groupId>
	<artifactId>ApacheLuceneTest</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<name>ApacheLuceneTest</name>
	<description>ApacheLuceneTest</description>
	<dependencies>
		<!--
			Lucene core library. The 3.6.1 release corresponds to the
			Version.LUCENE_36 constant used by the Java code in this tutorial.
		-->
		<dependency>
			<artifactId>lucene-core</artifactId>
			<groupId>org.apache.lucene</groupId>
			<type>jar</type>
			<version>3.6.1</version>
		</dependency>
	</dependencies>
</project>

Second, we will write our Indexer. The Indexer accepts a path to a directory, recursively reads the content of that directory and builds an in-memory index. The smallest unit of indexing in Lucene is called a Document, and you can add multiple Fields to a document. Here we add the file name to our index in a field that we call title.

InMemoryDirectoryIndexer.java


package com.mumz.test.lucene.first;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds an in-memory Lucene index containing the name of every entry found
 * beneath a directory tree.
 * <p>
 * The index is built once, from the constructor, and handed out through
 * {@link #getDirectory()}. Only entry names are indexed (field {@code title});
 * file contents are never read.
 *
 * @author prabhat.jha
 */
public class InMemoryDirectoryIndexer {

	/** Name of the single field stored for every indexed entry. */
	private static final String TITLE_FIELD = "title";

	/** The finished index, or {@code null} when indexing failed. */
	private Directory directory = null;

	/**
	 * Indexes everything beneath {@code path}.
	 *
	 * @param path
	 *            path of the directory to index
	 * @throws IllegalArgumentException
	 *             if {@code path} does not denote an existing directory
	 */
	public InMemoryDirectoryIndexer(String path) {
		indexDirectory(path);
	}

	/**
	 * Gets the directory holding the index.
	 *
	 * @return the directory holding the index
	 * @throws IllegalStateException
	 *             if the index could not be built
	 */
	public Directory getDirectory() {
		if (directory == null) {
			throw new IllegalStateException("Index hasn't been built, check configratuion and reIndex");
		}
		return directory;
	}

	/**
	 * Creates the in-memory index for the tree rooted at {@code path}.
	 * <p>
	 * The index is published in {@link #directory} only after the writer has
	 * been closed successfully, so that {@link #getDirectory()} can report a
	 * failed build instead of handing out an unusable index.
	 *
	 * @param path
	 *            path of the directory to index
	 * @throws IllegalArgumentException
	 *             if {@code path} does not denote an existing directory
	 */
	private void indexDirectory(String path) {
		File root = new File(path);
		if (!root.isDirectory()) {
			// Fail with a clear message instead of the NullPointerException that
			// iterating over File.listFiles() == null would otherwise produce.
			throw new IllegalArgumentException("Not an existing directory: " + path);
		}
		/*
		 * We create an in-memory index; don't do this when indexing a huge
		 * amount of data, as it can lead to a heavy memory footprint.
		 */
		Directory ramDirectory = new RAMDirectory();
		/*
		 * Specify the version; if the Lucene version changes we have to reindex.
		 */
		Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_36);
		IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_36, analyzer);
		IndexWriter indexWriter = null;
		boolean indexed = false;
		try {
			indexWriter = new IndexWriter(ramDirectory, indexWriterConfig);
			// Add entries recursively to the index writer.
			addIndex(indexWriter, root);
			indexed = true;
		} catch (CorruptIndexException e) {
			e.printStackTrace();
		} catch (LockObtainFailedException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (indexWriter != null) {
				try {
					indexWriter.close();
				} catch (IOException e) {
					// The writer could not be closed cleanly, so the index
					// must not be treated as built.
					e.printStackTrace();
					indexed = false;
				}
			}
		}
		directory = indexed ? ramDirectory : null;
	}

	/**
	 * Adds one document per entry found beneath {@code root}, descending into
	 * sub-directories. A sub-directory contributes its own name as well as the
	 * names of its children.
	 *
	 * @param indexWriter
	 *            the index writer receiving the documents
	 * @param root
	 *            the directory whose entries are added
	 * @throws CorruptIndexException
	 *             if the index is corrupt
	 * @throws IOException
	 *             if the writer fails to add a document
	 */
	private void addIndex(IndexWriter indexWriter, File root) throws CorruptIndexException, IOException {
		File[] entries = root.listFiles();
		if (entries == null) {
			// listFiles() returns null when the path is not a directory or
			// cannot be read; there is nothing to index below it.
			return;
		}
		for (File file : entries) {
			if (file.isDirectory()) {
				addIndex(indexWriter, file);
			}
			Document document = new Document();
			document.add(new Field(TITLE_FIELD, file.getName(), Field.Store.YES, Field.Index.ANALYZED));
			indexWriter.addDocument(document);
		}
	}
}

Third, we will write our Searcher: the code that searches against the index built in the previous step.
InMemoryDirectorySearcher.java


package com.mumz.test.lucene.first;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

/**
 * Runs queries against the {@code title} field of an index held in a Lucene
 * {@link Directory} and prints the matches to standard output.
 *
 * @author prabhat.jha
 */
public class InMemoryDirectorySearcher {

	/** The directory holding the index to search. */
	private Directory directory = null;

	/**
	 * The Constructor.
	 *
	 * @param directory
	 *            the directory holding the index to search
	 */
	public InMemoryDirectorySearcher(Directory directory) {
		this.directory = directory;
	}

	/**
	 * Searches the {@code title} field for {@code fileName} and prints up to
	 * 100 matches, each with its score, to standard output.
	 * <p>
	 * {@code fileName} is parsed with {@link QueryParser}; a parse failure or
	 * an I/O failure is reported with a stack trace and produces no results.
	 *
	 * @param fileName
	 *            the query text to look for
	 */
	public void search(String fileName) {
		IndexReader indexReader = null;
		IndexSearcher indexSearcher = null;
		try {
			/*
			 * Specify the version.
			 */
			Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_36);
			/*
			 * Create the query from the file name; we built our index on the
			 * title field, so we have to search against the same field.
			 */
			Query query = new QueryParser(Version.LUCENE_36, "title", analyzer).parse(fileName);
			indexReader = IndexReader.open(directory);
			indexSearcher = new IndexSearcher(indexReader);
			/*
			 * This will hold the results of the search operation.
			 */
			TopScoreDocCollector results = TopScoreDocCollector.create(100, true);
			indexSearcher.search(query, results);
			ScoreDoc[] scores = results.topDocs().scoreDocs;
			for (ScoreDoc scoreDoc : scores) {
				System.out.println("Found : " + indexSearcher.doc(scoreDoc.doc).get("title") + " with hits : " + scoreDoc.score);
			}
		} catch (ParseException e) {
			e.printStackTrace();
		} catch (CorruptIndexException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (indexSearcher != null) {
				try {
					indexSearcher.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			// Closing a searcher built from an existing reader does not close
			// that reader, so it has to be released explicitly.
			if (indexReader != null) {
				try {
					indexReader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
}

The final step is to write a simple Java class to showcase our indexer and searcher in action.
InMemoryMainApp.java


package com.mumz.test.lucene.first;

import java.util.Scanner;

/**
 * Console driver for the example: asks for a directory, indexes it and then
 * answers file-name queries until the user quits.
 *
 * @author prabhat.jha
 */
public class InMemoryMainApp {

	/**
	 * Prompts on standard input for a directory and, once indexing has been
	 * confirmed with {@code Y}, for file names to look up. Entering
	 * {@code q}/{@code quit} at the confirmation prompt, {@code quit} at the
	 * query prompt, or reaching the end of input ends the program normally.
	 * <p>
	 * Input is read one whitespace-delimited token at a time, so a directory
	 * or file name containing spaces cannot be entered.
	 *
	 * @param args
	 *            the args (unused)
	 */
	public static void main(String[] args) {
		Scanner scanner = new Scanner(System.in);
		try {
			System.out.println("Enter directory name");
			String directory = nextToken(scanner);
			if (directory == null) {
				return;
			}
			System.out.println("Should Index ? Enter Y for Yes or else q or quit to exit.");
			String response = nextToken(scanner);
			if (!"y".equalsIgnoreCase(response)) {
				// q, quit, end of input or any other answer: nothing to do.
				// Returning from main ends the program with a success status;
				// a user-requested quit is not an error.
				return;
			}
			InMemoryDirectoryIndexer inMemoryDirectoryIndexer = new InMemoryDirectoryIndexer(directory);
			// The index never changes after construction, so one searcher
			// serves every query.
			InMemoryDirectorySearcher inMemoryFileSearcher = new InMemoryDirectorySearcher(inMemoryDirectoryIndexer.getDirectory());
			while (true) {
				System.out.println("Enter fileName to query or quit to exit.");
				String fileName = nextToken(scanner);
				if (fileName == null || "Quit".equalsIgnoreCase(fileName)) {
					return;
				}
				inMemoryFileSearcher.search(fileName);
			}
		} finally {
			// Runs on every exit path, including the early returns above.
			scanner.close();
		}
	}

	/**
	 * Reads the next whitespace-delimited token.
	 *
	 * @param scanner
	 *            the scanner to read from
	 * @return the next token, or {@code null} when the input is exhausted
	 */
	private static String nextToken(Scanner scanner) {
		return scanner.hasNext() ? scanner.next() : null;
	}
}

That’s all; we have only touched the tip of the iceberg. Search is a vast and very interesting space, so let’s see how much we can cover in the coming days.