Lucene

Search Lucene Index created in database using JdbcDirectory

Posted on Updated on

In our last exercise we created a Lucene index in a database using JdbcDirectory, which comes with Compass. In this post we will search against the index we created. With that much said, let’s get our hands dirty and write some code.

Please note following files will be used from create lucene index post

  1. pom.xml
  2. MyJDBCDirectory.java
  3. JDBCBatchInsert.java
  4. JDBCIndexer.java (without creating index we cannot search against it ;), so in our test case below we will create an index before searching)
  5. JDBCDatabaseUtil.java
  6. Database schema

With that code already in our backyard, let’s finish up search quickly. Below is JDBCSearcher.java, which has a very simple search method that takes the name of the index field and the value we want to search for. It returns true if at least one matching record is found, or else false. Please note that with this search we are only interested in the highest-scoring result; the others we don’t care about.

package com.mumz.test.lucene.jdbc;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

/**
 * The Class InMemoryDirectorySearcher.
 * 
 * @author prabhat.jha
 */
public class JDBCSearcher {

	/** The directory. */
	private Directory	directory	= null;

	/**
	 * The Constructor.
	 * 
	 * @param directory
	 *            the directory
	 */
	public JDBCSearcher(Directory directory) {
		this.directory = directory;
	}

	/**
	 * Search.
	 * 
	 * @param fileName
	 *            the file name
	 * @return the string
	 */
	public boolean search(String columnName, String value) {
		IndexSearcher indexSearcher = null;
		try {
			/**
			 * Specify the version
			 */
			Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_36);
			/**
			 * Create query columnname (index name passed), we built out index for name, author and publisher so
			 * we have to search against the same.
			 */
			Query query = new QueryParser(Version.LUCENE_36, columnName, analyzer).parse(value);
			IndexReader indexReader = IndexReader.open(directory);
			indexSearcher = new IndexSearcher(indexReader);
			/**
			 * This will hold all the results which results from the search
			 * operation
			 */
			TopDocs topDocs = indexSearcher.search(query, 1);
			if (topDocs.scoreDocs.length > 0) {
				System.out.println("Found :  Book with id = " + indexSearcher.doc(topDocs.scoreDocs[0].doc).get("BOOKID") + " , Name = "
						+ indexSearcher.doc(topDocs.scoreDocs[0].doc).get("name") + " ,Author = "
						+ indexSearcher.doc(topDocs.scoreDocs[0].doc).get("author") + " ,Publisher = "
						+ indexSearcher.doc(topDocs.scoreDocs[0].doc).get("publisher") + " with hits : " + topDocs.scoreDocs[0].doc);
				return true;
			} else {
				System.out.println("No Record found");
				return false;
			}
		} catch (ParseException e) {
			e.printStackTrace();
		} catch (CorruptIndexException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (indexSearcher != null) {
				try {
					indexSearcher.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			indexSearcher = null;
		}
		return false;
	}
}

If you want to see all the hits, you can use the code snippet below.

// Collect up to 100 of the best-scoring hits instead of only the top one.
TopScoreDocCollector results = TopScoreDocCollector.create(100, true);
indexSearcher.search(query, results);
ScoreDoc[] scores = results.topDocs().scoreDocs;
for (ScoreDoc scoreDoc : scores) {
	// Load the stored document once per hit rather than once per printed field.
	org.apache.lucene.document.Document book = indexSearcher.doc(scoreDoc.doc);
	System.out.println("Found :  Book with id = " + book.get("BOOKID") + " , Name = " + book.get("name") + " ,Author = "
			+ book.get("author") + " ,Publisher = " + book.get("publisher") + " with hits : " + scoreDoc.score);
}

And finally here is our JUnit test case, please note this test case assumes that you also have indexer code with you.

package com.mumz.test.lucene.jdbc;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.jdbc.dialect.MySQLDialect;

import junit.framework.TestCase;

/**
 * The Class LuceneJDBCTest.
 * @author prabhat.jha
 */
public class LuceneJDBCTest extends TestCase {

	/** The directory. */
	private Directory	directory	= null;

       /** (non-Javadoc)
	 * @see junit.framework.TestCase#setUp()
	 */
	protected void setUp() throws Exception {
		directory = new MyJDBCDirectory(JDBCDatabaseUtil.getDataSource(), new MySQLDialect(), "LUCENE_INDEX_TABLE");
		super.setUp();
	}

	/**
	 * Test insert record.
	 */
	public void testInsertRecord() {		
		new JDBCBatchInsert().insertRecords();
	}

	/**
	 * Test build index.
	 */
	public void testBuildIndex() {
		new JDBCIndexer(directory).buildIndex();
	}

	/**
	 * Test search record on name.
	 */
	public void testSearchRecordOnName() {
		boolean found = new JDBCSearcher(directory).search("name", "Spring In Action");
		assertEquals(found, true);
	}

	/**
	 * Test search record fail on name.
	 */
	public void testSearchRecordFailOnName() {
		boolean found = new JDBCSearcher(directory).search("name", "No Such BookName");
		assertEquals(found, false);
	}

	/**
	 * Test search record on author.
	 */
	public void testSearchRecordOnAuthor() {
		boolean found = new JDBCSearcher(directory).search("author", "Test Author Hibernate In Action10");
		assertEquals(found, true);
	}

	/**
	 * Test search record fail on author.
	 */
	public void testSearchRecordFailOnAuthor() {
		boolean found = new JDBCSearcher(directory).search("name", "No Such Author");
		assertEquals(found, false);
	}

	/**
	 * Test search record on publisher.
	 */
	public void testSearchRecordOnPublisher() {
		boolean found = new JDBCSearcher(directory).search("publisher", "Test Publisher Spring Bible7");
		assertEquals(found, true);
	}

	/**
	 * Test search record fail on publisher.
	 */
	public void testSearchRecordFailOnPublisher() {
		boolean found = new JDBCSearcher(directory).search("name", "No Such Publisher");
		assertEquals(found, false);
	}
	
	/* (non-Javadoc)
	 * @see junit.framework.TestCase#tearDown()
	 */
	protected void tearDown() throws Exception {
		if(directory != null) {
			directory.close();
		}
		super.tearDown();
	}
}

Full Text Search with Hibernate Search 4.1, Lucene and JPA

Posted on Updated on

Earlier we worked directly with Lucene API to create and search index

  1. Index and Search a Directory using Apache Lucene
  2. Create Lucene Index in database using JdbcDirectory

Instead we can use HibernateSearch which internally uses Lucene functionality to index and search content. With that let’s get some code behind us. We will extend our code from JPA OneToMany Unidirectional without Join Table.

First let’s add hibernate-search lib in our pom.
pom.xml

<!-- Maven build for the Hibernate Search example: a plain jar project. -->
<project
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xmlns="http://maven.apache.org/POM/4.0.0"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
>
	<modelVersion>4.0.0</modelVersion>
	<groupId>com.mumz.test.hibernatesearch</groupId>
	<artifactId>MumzHibernateSearch</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>
	<name>MumzHibernateSearch</name>
	<url>http://maven.apache.org</url>
	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
	</properties>
	<dependencies>
		<!-- Test-only dependency -->
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>3.8.1</version>
			<scope>test</scope>
		</dependency>
		<!-- Hibernate Search: the library this post adds to the project -->
		<dependency>
			<groupId>org.hibernate</groupId>
			<artifactId>hibernate-search</artifactId>
			<version>4.1.1.Final</version>
		</dependency>
		<!-- JPA 2.0 API plus the Hibernate implementation behind it -->
		<dependency>
			<groupId>org.hibernate.javax.persistence</groupId>
			<artifactId>hibernate-jpa-2.0-api</artifactId>
			<version>1.0.1.Final</version>
		</dependency>
		<dependency>
			<groupId>org.hibernate</groupId>
			<artifactId>hibernate-entitymanager</artifactId>
			<version>4.1.7.Final</version>
		</dependency>
		<dependency>
			<groupId>org.hibernate</groupId>
			<artifactId>hibernate-core</artifactId>
			<version>4.1.7.Final</version>
		</dependency>
		<!-- MySQL JDBC driver, referenced by persistence.xml -->
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>5.1.21</version>
		</dependency>
	</dependencies>
</project>

Next we will add hibernate search specific annotation to our Entity classes, first MHSBookEntityBean.

MHSBookEntityBean.java

package com.mumz.test.hibernatesearch.entitybeans;

import java.io.Serializable;

import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
import javax.persistence.Id;
import javax.persistence.Table;

import org.hibernate.search.annotations.Analyze;
import org.hibernate.search.annotations.DocumentId;
import org.hibernate.search.annotations.Field;
import org.hibernate.search.annotations.Index;
import org.hibernate.search.annotations.Indexed;
import org.hibernate.search.annotations.Store;

/**
 * JPA entity mapped to the BOOK table and registered with Hibernate Search.
 * 
 * @author prabhat.jha
 */
@Entity
@Table(name = "BOOK")
// @Indexed marks this entity for indexing by Hibernate Search
@Indexed
public class MHSBookEntityBean implements Serializable {

	/** Serialization version marker. */
	private static final long	serialVersionUID	= -5129783468137830152L;

	/** Primary key, mapped to BOOK_ID. */
	private Long				id;

	/** Book title, mapped to BOOK_NAME. */
	private String				name;

	/** Book author, mapped to BOOK_AUTHOR. */
	private String				author;

	/**
	 * No-argument constructor.
	 */
	public MHSBookEntityBean() {
	}

	/**
	 * Returns the primary key.
	 * 
	 * @return the id, or null if none has been set
	 */
	@Id
	// @DocumentId is optional when @Id is present; it names the property used as the document id in the index
	@DocumentId
	@GeneratedValue(strategy = GenerationType.AUTO)
	@Column(name = "BOOK_ID")
	public Long getId() {
		return id;
	}

	/**
	 * Assigns the primary key.
	 * 
	 * @param id
	 *            the id to use
	 */
	public void setId(Long id) {
		this.id = id;
	}

	/**
	 * Returns the book title.
	 * 
	 * @return the name
	 */
	@Column(name = "BOOK_NAME")
	// Indexed and analyzed so the text is searchable; Store.NO means the
	// original value is not stored inside the index itself
	@Field(index = Index.YES, analyze = Analyze.YES, store = Store.NO)
	public String getName() {
		return name;
	}

	/**
	 * Assigns the book title.
	 * 
	 * @param name
	 *            the name to use
	 */
	public void setName(String name) {
		this.name = name;
	}

	/**
	 * Returns the book author.
	 * 
	 * @return the author
	 */
	@Column(name = "BOOK_AUTHOR")
	// Indexed and analyzed so the text is searchable; Store.NO means the
	// original value is not stored inside the index itself
	@Field(index = Index.YES, analyze = Analyze.YES, store = Store.NO)
	public String getAuthor() {
		return author;
	}

	/**
	 * Assigns the book author.
	 * 
	 * @param author
	 *            the author to use
	 */
	public void setAuthor(String author) {
		this.author = author;
	}

	/**
	 * Combines author, id and name (in that order) with the usual 31-multiplier
	 * scheme; a null part contributes 0.
	 * 
	 * @see java.lang.Object#hashCode()
	 */
	@Override
	public int hashCode() {
		int hash = 31 + hashOf(author);
		hash = 31 * hash + hashOf(id);
		return 31 * hash + hashOf(name);
	}

	/**
	 * Two books are equal when author, id and name are all equal (null matches
	 * only null).
	 * 
	 * @see java.lang.Object#equals(java.lang.Object)
	 */
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		// instanceof is false for null, so no separate null check is needed
		if (!(obj instanceof MHSBookEntityBean)) {
			return false;
		}
		MHSBookEntityBean that = (MHSBookEntityBean) obj;
		return sameValue(author, that.author) && sameValue(id, that.id) && sameValue(name, that.name);
	}

	/**
	 * Renders id, name and author.
	 * 
	 * @see java.lang.Object#toString()
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("MHSBookEntityBean [id=");
		text.append(id).append(", name=").append(name).append(", author=").append(author).append("]");
		return text.toString();
	}

	/** Hash code of a possibly-null value; null hashes to 0. */
	private static int hashOf(Object value) {
		return (value == null) ? 0 : value.hashCode();
	}

	/** Null-safe equality: null equals only null. */
	private static boolean sameValue(Object left, Object right) {
		return (left == null) ? (right == null) : left.equals(right);
	}
}

Highlighted section in code above explains usage of each annotation.

Next we will update our MHSBookShelfEntityBean so that it can be indexed as well.

MHSBookShelfEntityBean.java

package com.mumz.test.hibernatesearch.entitybeans;

import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;

import javax.persistence.CascadeType;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.FetchType;
import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
import javax.persistence.Id;
import javax.persistence.JoinColumn;
import javax.persistence.OneToMany;
import javax.persistence.Table;

import org.hibernate.search.annotations.Analyze;
import org.hibernate.search.annotations.DocumentId;
import org.hibernate.search.annotations.Field;
import org.hibernate.search.annotations.Index;
import org.hibernate.search.annotations.Indexed;
import org.hibernate.search.annotations.Store;

/**
 * JPA entity mapped to the BOOK_SHELF table, holding a set of books, and
 * registered with Hibernate Search.
 * 
 * @author prabhat.jha
 */
@Entity
@Table(name = "BOOK_SHELF")
// @Indexed marks this entity for indexing by Hibernate Search
@Indexed
public class MHSBookShelfEntityBean implements Serializable {

	/** Serialization version marker. */
	private static final long		serialVersionUID	= -7127365575633206221L;

	/** Primary key, mapped to BOOK_SHELF_ID. */
	private Long					id;

	/** Shelf name, mapped to BOOK_SHELF_NAME. */
	private String					name;

	/** Books placed on this shelf; starts out empty. */
	private Set<MHSBookEntityBean>	books				= new HashSet<MHSBookEntityBean>();

	/**
	 * No-argument constructor.
	 */
	public MHSBookShelfEntityBean() {
	}

	/**
	 * Returns the primary key.
	 * 
	 * @return the id, or null if none has been set
	 */
	@Id
	// @DocumentId is optional when @Id is present; it names the property used as the document id in the index
	@DocumentId
	@GeneratedValue(strategy = GenerationType.AUTO)
	@Column(name = "BOOK_SHELF_ID")
	public Long getId() {
		return id;
	}

	/**
	 * Assigns the primary key.
	 * 
	 * @param id
	 *            the id to use
	 */
	public void setId(Long id) {
		this.id = id;
	}

	/**
	 * Returns the shelf name.
	 * 
	 * @return the name
	 */
	@Column(name = "BOOK_SHELF_NAME")
	// Indexed and analyzed so the text is searchable; Store.NO means the
	// original value is not stored inside the index itself
	@Field(index = Index.YES, analyze = Analyze.YES, store = Store.NO)
	public String getName() {
		return name;
	}

	/**
	 * Assigns the shelf name.
	 * 
	 * @param name
	 *            the name to use
	 */
	public void setName(String name) {
		this.name = name;
	}

	/**
	 * Returns the books on this shelf. The association is eagerly fetched,
	 * cascades every operation and is joined through BOOK_SHELF_ID.
	 * 
	 * @return the books
	 */
	@OneToMany(cascade = CascadeType.ALL, fetch = FetchType.EAGER)
	@JoinColumn(name = "BOOK_SHELF_ID", referencedColumnName = "BOOK_SHELF_ID")
	public Set<MHSBookEntityBean> getBooks() {
		return books;
	}

	/**
	 * Replaces the set of books on this shelf.
	 * 
	 * @param books
	 *            the books to use
	 */
	public void setBooks(Set<MHSBookEntityBean> books) {
		this.books = books;
	}

	/**
	 * Combines books, id and name (in that order) with the usual 31-multiplier
	 * scheme; a null part contributes 0.
	 * 
	 * @see java.lang.Object#hashCode()
	 */
	@Override
	public int hashCode() {
		int hash = 31 + hashOf(books);
		hash = 31 * hash + hashOf(id);
		return 31 * hash + hashOf(name);
	}

	/**
	 * Two shelves are equal when books, id and name are all equal (null matches
	 * only null).
	 * 
	 * @see java.lang.Object#equals(java.lang.Object)
	 */
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		// instanceof is false for null, so no separate null check is needed
		if (!(obj instanceof MHSBookShelfEntityBean)) {
			return false;
		}
		MHSBookShelfEntityBean that = (MHSBookShelfEntityBean) obj;
		return sameValue(books, that.books) && sameValue(id, that.id) && sameValue(name, that.name);
	}

	/**
	 * Renders id, name and books.
	 * 
	 * @see java.lang.Object#toString()
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("MHSBookShelfEntityBean [id=");
		text.append(id).append(", name=").append(name).append(", books=").append(books).append("]");
		return text.toString();
	}

	/** Hash code of a possibly-null value; null hashes to 0. */
	private static int hashOf(Object value) {
		return (value == null) ? 0 : value.hashCode();
	}

	/** Null-safe equality: null equals only null. */
	private static boolean sameValue(Object left, Object right) {
		return (left == null) ? (right == null) : left.equals(right);
	}
}

Finally we will write our code which will start indexing and then will search against index created.

TestHibernateSearch.java

package com.mumz.test.hibernatesearch.entitybeans;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import javax.persistence.EntityManager;
import javax.persistence.Persistence;
import javax.persistence.Query;

import org.hibernate.search.jpa.FullTextEntityManager;
import org.hibernate.search.jpa.Search;
import org.hibernate.search.query.dsl.QueryBuilder;

/**
 * The Class TestHibernateSearch.
 * 
 * @author prabhat.jha
 */
public class TestHibernateSearch {
	
	/**
	 * The main method.
	 * 
	 * @param args
	 *            the arguments
	 */
	@SuppressWarnings("unchecked")
	public static void main(String[] args) {
		EntityManager entityManager = Persistence.createEntityManagerFactory("MumzHibernateSearch").createEntityManager();
		FullTextEntityManager fullTextEntityManager = Search.getFullTextEntityManager(entityManager);
		try {
			// This will ensure that index for already inserted data is created.
			fullTextEntityManager.createIndexer().startAndWait();
			// Add some more record, lucene will index every new object inserted, removed or updated.
			addMoreRecords(entityManager);
			// Search for Book
			QueryBuilder qb = fullTextEntityManager.getSearchFactory().buildQueryBuilder().forEntity(MHSBookEntityBean.class).get();
			org.apache.lucene.search.Query query = qb.keyword().onFields("name", "author").matching("Pro Android 4").createQuery();
			Query jpaQuery = fullTextEntityManager.createFullTextQuery(query, MHSBookEntityBean.class);
			
			// execute search
			List<MHSBookEntityBean> bookResult = jpaQuery.getResultList();
			
			if (bookResult != null) {
				for (MHSBookEntityBean mhsBookEntityBean : bookResult) {
					System.out.println("Book found = " + mhsBookEntityBean);
				}
			}
			// Seach for book shelf
			qb = fullTextEntityManager.getSearchFactory().buildQueryBuilder().forEntity(MHSBookShelfEntityBean.class).get();
			query = qb.keyword().onFields("name").matching("Technical").createQuery();
			jpaQuery = fullTextEntityManager.createFullTextQuery(query, MHSBookShelfEntityBean.class);
			
			// execute search
			List<MHSBookShelfEntityBean> bookShelfResult = jpaQuery.getResultList();
			
			if (bookShelfResult != null) {
				for (MHSBookShelfEntityBean mhsBookShelfEntityBean : bookShelfResult) {
					System.out.println("Book Shelf Found = " + mhsBookShelfEntityBean);
				}
			}
			
		} catch (InterruptedException e) {
			e.printStackTrace();
		} finally {
			if (fullTextEntityManager != null) {
				fullTextEntityManager.close();
			}
			fullTextEntityManager = null;
		}
	}
	
	/**
	 * Adds the more records.
	 * 
	 * @param entityManager
	 *            the entity manager
	 */
	private static void addMoreRecords(EntityManager entityManager) {
		Set<MHSBookEntityBean> books = new HashSet<MHSBookEntityBean>();
		MHSBookEntityBean mhsBookEntityBean = new MHSBookEntityBean();
		mhsBookEntityBean.setName("Pro Spring 3");
		mhsBookEntityBean.setAuthor("Clarence Ho and Rob Harrop");
		books.add(mhsBookEntityBean);
		mhsBookEntityBean = new MHSBookEntityBean();
		mhsBookEntityBean.setName("Pro JPA 2 Mastering the Java Persistence API");
		mhsBookEntityBean.setAuthor("Mike Keith and Merrick Schincariol");
		books.add(mhsBookEntityBean);
		
		// Fetch the book shelf id, in my db id was 3 so I added it as 3, you can use a query or something
		Query query = entityManager.createQuery("SELECT BOOKSHELF FROM " + MHSBookShelfEntityBean.class.getName() + " BOOKSHELF");
		MHSBookShelfEntityBean bookShelfEntityBean = (MHSBookShelfEntityBean) query.getSingleResult();
		bookShelfEntityBean.setName("Technical Books");
		bookShelfEntityBean.setBooks(books);
		
		entityManager.getTransaction().begin();
		entityManager.persist(bookShelfEntityBean);
		entityManager.getTransaction().commit();
	}
}

Lucene stores its index in a Directory; in this case we will use a file-system directory. Since we are using JPA, we will configure the directory provider and the index location in persistence.xml.

persistence.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- JPA 2.0 configuration for the Hibernate Search example. -->
<persistence xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    version="2.0"
    xmlns="http://java.sun.com/xml/ns/persistence"
    xsi:schemaLocation="http://java.sun.com/xml/ns/persistence http://java.sun.com/xml/ns/persistence/persistence_2_0.xsd" >

    <!-- The unit name must match the one passed to Persistence.createEntityManagerFactory -->
    <persistence-unit name="MumzHibernateSearch" >
    	<class>com.mumz.test.hibernatesearch.entitybeans.MHSBookEntityBean</class>
    	<class>com.mumz.test.hibernatesearch.entitybeans.MHSBookShelfEntityBean</class>
        <properties>
            <property name="hibernate.show_sql" value="true"/>
            <!-- NOTE(review): connection settings, including credentials, are hard-coded for a local MySQL instance -->
            <property name="hibernate.connection.driver_class" value="com.mysql.jdbc.Driver"/>
            <property name="hibernate.connection.password" value="root"/>
            <property name="hibernate.connection.url" value="jdbc:mysql://localhost/jpa_schema"/>
            <property name="hibernate.connection.username" value="root"/>
            <property name="hibernate.dialect" value="org.hibernate.dialect.MySQLDialect"/>
            <!-- Hibernate Search: keep the Lucene index on the file system under indexBase -->
            <property name="hibernate.search.default.directory_provider" value="filesystem" />
            <property name="hibernate.search.default.indexBase" value="c:/lucene/indexes/first" />
        </properties>
    </persistence-unit>
</persistence>

That’s all we have to do to get Hibernate Search up and running.

Create Lucene Index in database using JdbcDirectory

Posted on Updated on

In our last post we built a simple index over the file system. While that example works fine, it cannot be extended to a clustered environment and cannot be used for large documents because of its memory footprint. Lucene doesn’t provide a built-in JDBC interface, but Compass does, although the JDBC interface of Compass is not compatible with Lucene 3.6. We will extend the Compass JDBC interface to match the Lucene 3.6 changes.

First we will create our Maven Project in Eclipse and add required dependencies.

pom.xml

<!-- Maven build for the JdbcDirectory example. -->
<project 
	xmlns="http://maven.apache.org/POM/4.0.0" 
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation=
		"http://maven.apache.org/POM/4.0.0 
		http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<groupId>com.mumz.test.lucene</groupId>
	<artifactId>ApacheLuceneTest</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<name>ApacheLuceneTest</name>
	<description>ApacheLuceneTest</description>
	<dependencies>
		<!-- Lucene core, the 3.6 line the code is written against -->
		<dependency>
			<artifactId>lucene-core</artifactId>
			<groupId>org.apache.lucene</groupId>
			<type>jar</type>
			<version>3.6.1</version>
		</dependency>
		<!-- Compass supplies the JdbcDirectory that MyJDBCDirectory extends -->
		<dependency>
			<groupId>org.compass-project</groupId>
			<artifactId>compass</artifactId>
			<version>2.2.0</version>
		</dependency>
		<!-- MySQL JDBC driver used by JDBCDatabaseUtil -->
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>5.1.21</version>
		</dependency>
	</dependencies>
</project>

Second since JdbcDirectory provided with Compass 2.2.0 doesn’t implement all the methods defined in abstract class Directory, we will provide our own implementation which will extend JdbcDirectory.

MyJDBCDirectory.java

package com.mumz.test.lucene.jdbc;

import java.io.IOException;

import javax.sql.DataSource;

import org.apache.lucene.store.jdbc.JdbcDirectory;
import org.apache.lucene.store.jdbc.JdbcDirectorySettings;
import org.apache.lucene.store.jdbc.JdbcStoreException;
import org.apache.lucene.store.jdbc.dialect.Dialect;
import org.apache.lucene.store.jdbc.support.JdbcTable;

/**
 * {@link JdbcDirectory} subclass that supplies {@link #listAll()}, which the
 * Lucene version used here expects a directory to provide. Every constructor
 * simply forwards to the matching {@code JdbcDirectory} constructor.
 * 
 * @author prabhat.jha
 */
public class MyJDBCDirectory extends JdbcDirectory {

	/**
	 * Creates a directory from a data source and a table name.
	 *
	 * @param dataSource the data source
	 * @param tableName the table name
	 * @throws JdbcStoreException if the parent constructor reports a failure
	 */
	public MyJDBCDirectory(DataSource dataSource, String tableName) throws JdbcStoreException {
		super(dataSource, tableName);
	}

	/**
	 * Creates a directory from a data source and a prepared table definition.
	 *
	 * @param dataSource the data source
	 * @param table the table
	 */
	public MyJDBCDirectory(DataSource dataSource, JdbcTable table) {
		super(dataSource, table);
	}

	/**
	 * Creates a directory from a data source, a dialect and a table name.
	 *
	 * @param dataSource the data source
	 * @param dialect the dialect
	 * @param tableName the table name
	 */
	public MyJDBCDirectory(DataSource dataSource, Dialect dialect, String tableName) {
		super(dataSource, dialect, tableName);
	}

	/**
	 * Creates a directory from a data source, settings and a table name.
	 *
	 * @param dataSource the data source
	 * @param settings the settings
	 * @param tableName the table name
	 * @throws JdbcStoreException if the parent constructor reports a failure
	 */
	public MyJDBCDirectory(DataSource dataSource, JdbcDirectorySettings settings, String tableName) throws JdbcStoreException {
		super(dataSource, settings, tableName);
	}

	/**
	 * Creates a directory from a data source, a dialect, settings and a table
	 * name.
	 * 
	 * @param dataSource
	 *            the data source
	 * @param dialect
	 *            the dialect
	 * @param settings
	 *            the settings
	 * @param tableName
	 *            the table name
	 */
	public MyJDBCDirectory(DataSource dataSource, Dialect dialect, JdbcDirectorySettings settings, String tableName) {
		super(dataSource, dialect, settings, tableName);
	}

	/**
	 * Lists the entries of this directory by delegating to the parent's
	 * {@code list()}.
	 *
	 * @return whatever the parent's {@code list()} returns
	 * @throws IOException Signals that an I/O exception has occurred.
	 * @see org.apache.lucene.store.Directory#listAll()
	 */
	@Override
	public String[] listAll() throws IOException {
		return super.list();
	}
}

JdbcDirectory has a predefined table structure where it works on a clob field.

Third we will write our Indexer code. In our code we will achieve following:

  1. Add few records in our database by using JDBCBatchInsert.java
  2. Create table to be used by JdbcDirectory using createIndexTable method
  3. Build an index on this record by using index method

Let’s add some records in our database for testing our application.
JDBCBatchInsert.java

package com.mumz.test.lucene.jdbc;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

/**
 * The Class JDBCBatchInsert.
 * @author prabhat.jha
 */
public class JDBCBatchInsert {
	
	/** The Constant QUERY. */
	private static final String		QUERY			= "INSERT INTO BOOKS (BOOK_ID, BOOK_NAME, BOOK_AUTHOR, BOOK_PUBLISHER) VALUES (?, ?, ?, ?)";

	/** The Constant BOOK_FIRST_PART. */
	private final static String[]	BOOK_FIRST_PART	= {"Spring", "Hibernate", "Lucene", "Mahout", "JPA", "JSF", "Swing", "Hadoop", "Hbase"};

	/** The Constant BOOK_LAST_PART. */
	private final static String[]	BOOK_LAST_PART	= {"In Action", "Complete Reference", "Demystified", "Tutorial", "Explained",
			"Simplified", "Bible", "Cook Book", "Crash Course"};

	/** The Constant BLANK_SPACE. */
	private final static String		BLANK_SPACE		= " ";

	/**
	 * Insert records.
	 */
	public void insertRecords() {
		Connection connection = null;
		PreparedStatement pstmt = null;
		ResultSet resultSet = null;
		try {
			connection = JDBCDatabaseUtil.getConnection();
			pstmt = connection.prepareStatement(QUERY);
			int index = 0;
			for (String firstPart : BOOK_FIRST_PART) {
				for (String lastPart : BOOK_LAST_PART) {
					pstmt.setInt(1, ++index);
					pstmt.setString(2, firstPart + BLANK_SPACE + lastPart);
					pstmt.setString(3, "Test Author" + BLANK_SPACE + firstPart + BLANK_SPACE + lastPart + index);
					pstmt.setString(4, "Test Publisher" + BLANK_SPACE + firstPart + BLANK_SPACE + lastPart + index);
					pstmt.addBatch();
				}
			}
			pstmt.executeBatch();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			try {
				if (resultSet != null) {
					resultSet.close();
				}
				if (pstmt != null) {
					pstmt.close();
				}
				if (connection != null) {
					connection.close();
				}
				resultSet = null;
				pstmt = null;
				connection = null;
			} catch (SQLException e) {
				e.printStackTrace();
			}
		}
	}
}

Next we will add index in our database.
JDBCIndexer.java

package com.mumz.test.lucene.jdbc;

import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.jdbc.JdbcDirectory;
import org.apache.lucene.store.jdbc.dialect.MySQLDialect;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index over the rows of the BOOKS table and stores that index
 * in a JDBC-backed {@link Directory}.
 * 
 * @author prabhat.jha
 */
public class JDBCIndexer {

	/** The directory the index is written to. */
	private Directory	jdbcDirectory	= null;

	/**
	 * Instantiates a new jDBC indexer.
	 * 
	 * @param jdbcDirectory
	 *            the directory to write the index to; when null, a default
	 *            MySQL-backed directory is created on first use
	 */
	public JDBCIndexer(Directory jdbcDirectory) {
		super();
		this.jdbcDirectory = jdbcDirectory;
	}

	/**
	 * Gets the jdbc directory.
	 * 
	 * @return the jdbc directory
	 * @throws IllegalStateException
	 *             if no directory has been set yet
	 */
	public Directory getJdbcDirectory() {
		if (jdbcDirectory == null) {
			throw new IllegalStateException("Index not yet build, rerun indexing");
		}
		return jdbcDirectory;
	}

	/**
	 * Sets the jdbc directory.
	 * 
	 * @param jdbcDirectory
	 *            the new jdbc directory
	 */
	public void setJdbcDirectory(Directory jdbcDirectory) {
		this.jdbcDirectory = jdbcDirectory;
	}

	/**
	 * Creates the index table and indexes every row of the BOOKS table.
	 */
	public void buildIndex() {
		createAndBuildIndex();
	}

	/**
	 * Runs the two build steps in order: table creation, then indexing.
	 */
	private void createAndBuildIndex() {
		createIndexTable();
		index();
	}

	/**
	 * Opens an index writer on the directory, adds all books and closes the
	 * writer again. Failures are printed, not thrown.
	 */
	private void index() {
		Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_36);
		IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_36, analyzer);
		IndexWriter indexWriter = null;
		try {
			indexWriter = new IndexWriter(getJdbcDirectory(), indexWriterConfig);
			addIndex(indexWriter);
		} catch (CorruptIndexException e) {
			e.printStackTrace();
		} catch (LockObtainFailedException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (indexWriter != null) {
				try {
					indexWriter.close();
				} catch (CorruptIndexException e) {
					e.printStackTrace();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Adds one document per row of the BOOKS table. The id is stored verbatim
	 * under BOOKID; name, author and publisher are stored and analyzed so
	 * they can be searched. SQL failures are printed; index failures propagate
	 * to the caller.
	 * 
	 * @param indexWriter
	 *            the index writer
	 * @throws IOException
	 *             if a document cannot be written to the index
	 */
	private void addIndex(IndexWriter indexWriter) throws IOException {
		Connection connection = null;
		PreparedStatement pstmt = null;
		ResultSet resultSet = null;
		try {
			connection = JDBCDatabaseUtil.getConnection();
			String query = "SELECT BOOK_ID, BOOK_NAME, BOOK_AUTHOR, BOOK_PUBLISHER FROM BOOKS";
			pstmt = connection.prepareStatement(query);
			resultSet = pstmt.executeQuery();
			while (resultSet.next()) {
				Document document = new Document();
				// Keep the id as a single untokenized term so search results can report it.
				document.add(new Field("BOOKID", String.valueOf(resultSet.getInt(1)), Field.Store.YES, Field.Index.NOT_ANALYZED));
				document.add(new Field("name", String.valueOf(resultSet.getString(2)), Field.Store.YES, Field.Index.ANALYZED));
				document.add(new Field("author", String.valueOf(resultSet.getString(3)), Field.Store.YES, Field.Index.ANALYZED));
				document.add(new Field("publisher", String.valueOf(resultSet.getString(4)), Field.Store.YES, Field.Index.ANALYZED));
				indexWriter.addDocument(document);
			}
		} catch (SQLException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(resultSet, pstmt, connection);
		}
	}

	/**
	 * Closes the given JDBC resources independently, so that a failure closing
	 * one of them does not leave the others open. Null arguments are skipped.
	 */
	private static void closeQuietly(ResultSet resultSet, PreparedStatement pstmt, Connection connection) {
		if (resultSet != null) {
			try {
				resultSet.close();
			} catch (SQLException e) {
				e.printStackTrace();
			}
		}
		if (pstmt != null) {
			try {
				pstmt.close();
			} catch (SQLException e) {
				e.printStackTrace();
			}
		}
		if (connection != null) {
			try {
				connection.close();
			} catch (SQLException e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * Makes sure a directory is set (defaulting to a MySQL-backed one on the
	 * LUCENE_INDEX_TABLE table) and asks it to create its backing table.
	 */
	private void createIndexTable() {
		if (this.jdbcDirectory == null) {
			setJdbcDirectory(new MyJDBCDirectory(JDBCDatabaseUtil.getDataSource(), new MySQLDialect(), "LUCENE_INDEX_TABLE"));
		}
		try {
			// No need to create the index table by hand; create() does it.
			// NOTE(review): confirm what create() does when the table already
			// exists (this runs on every buildIndex call).
			((JdbcDirectory) getJdbcDirectory()).create();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

Last we have used getDataSource and getConnection methods while inserting records and building index, these two methods are utility methods part of our database util.

JDBCDatabaseUtil.java


package com.mumz.test.lucene.jdbc;

import java.sql.Connection;
import java.sql.SQLException;

import javax.sql.DataSource;

import com.mysql.jdbc.jdbc2.optional.MysqlDataSource;

/**
 * Static helpers that build the MySQL data source and open connections for
 * the JDBC indexing examples.
 * 
 * @author prabhat.jha
 */
public class JDBCDatabaseUtil {

	/** Account name used to log in to the database. */
	private static final String	DB_USER		= "root";

	/** Password used to log in to the database. */
	private static final String	DB_PASSWORD	= "root";

	/** JDBC URL of the search_schema database, including driver options. */
	private static final String	DB_URL		= "jdbc:mysql://localhost:3306/search_schema?emulateLocators=true&useUnicode=true&characterEncoding=UTF-8&useFastDateParsing=false";

	/**
	 * Builds a new, fully configured MySQL data source. A fresh instance is
	 * created on every call.
	 * 
	 * @return the configured data source
	 */
	public static DataSource getDataSource() {
		MysqlDataSource mysqlSource = new MysqlDataSource();
		mysqlSource.setUser(DB_USER);
		mysqlSource.setPassword(DB_PASSWORD);
		mysqlSource.setUrl(DB_URL);
		return mysqlSource;
	}

	/**
	 * Opens a connection through a newly built data source.
	 * 
	 * @return the open connection; the caller is responsible for closing it
	 * @throws SQLException
	 *             if the connection cannot be obtained
	 */
	public static Connection getConnection() throws SQLException {
		DataSource source = getDataSource();
		return source.getConnection();
	}
}

Finally, here is the SQL script that creates the database schema used in this tutorial.


-- Remember the current session settings in user variables, then switch off
-- unique and foreign-key checks and use TRADITIONAL SQL mode for this script.
SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='TRADITIONAL';

-- WARNING: drops any existing `search_schema` (and everything in it) before
-- recreating it with utf8 as the default character set.
DROP SCHEMA IF EXISTS `search_schema` ;
CREATE SCHEMA IF NOT EXISTS `search_schema` DEFAULT CHARACTER SET utf8 ;
USE `search_schema` ;

-- -----------------------------------------------------
-- Table `search_schema`.`books`
-- One row per book; BOOK_ID is the auto-incremented primary key.
-- The AUTO_INCREMENT = 82 table option below sets the initial counter value
-- (presumably carried over from an export of an existing table -- confirm).
-- -----------------------------------------------------
DROP TABLE IF EXISTS `search_schema`.`books` ;

CREATE  TABLE IF NOT EXISTS `search_schema`.`books` (
  `BOOK_ID` INT(11) NOT NULL AUTO_INCREMENT ,
  `BOOK_NAME` VARCHAR(45) NOT NULL ,
  `BOOK_AUTHOR` VARCHAR(45) NOT NULL ,
  `BOOK_PUBLISHER` VARCHAR(45) NOT NULL ,
  PRIMARY KEY (`BOOK_ID`) )
ENGINE = InnoDB
AUTO_INCREMENT = 82
DEFAULT CHARACTER SET = utf8;

-- Restore the session settings saved at the top of the script.
SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;

Integrating Apache Lucene and Maven in Eclipse

Posted on

In two steps we will integrate Apache Lucene and Maven in Eclipse.

First Create Java Project using Maven in Eclipse

Second, add the Lucene dependency to your pom.xml, so that your pom.xml looks like this:
pom.xml

<project 
	xmlns="http://maven.apache.org/POM/4.0.0" 
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation=
		"http://maven.apache.org/POM/4.0.0 
		http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<!-- Maven coordinates of the example project -->
	<groupId>com.mumz.test.lucene</groupId>
	<artifactId>ApacheLuceneTest</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<name>ApacheLuceneTest</name>
	<description>ApacheLuceneTest</description>
	<!-- The only dependency declared here is lucene-core 3.6.1 -->
	<dependencies>
		<dependency>
			<artifactId>lucene-core</artifactId>
			<groupId>org.apache.lucene</groupId>
			<type>jar</type>
			<version>3.6.1</version>
		</dependency>
	</dependencies>
</project>

That’s all.

Index and Search a Directory using Apache Lucene

Posted on

Building a custom search is a common requirement in almost every application. Building such a system can be complex and tedious with numerous use cases around. Apache Lucene is a search framework built in Java. In this tutorial we will write a small application which will use Lucene to search a given directory.

Lucene works mainly on two concepts:

  1. Index : Lucene provides great search throughput by building an index on the content and then searching those indexes. Content is divided into smaller units and indexed.
  2. Search : Once indexing is complete any query can be compared with the indexes and result is obtained.

First we will build our Eclipse Maven Project. Below is our pom.xml.

pom.xml

<project 
	xmlns="http://maven.apache.org/POM/4.0.0" 
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation=
		"http://maven.apache.org/POM/4.0.0 
		http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<!-- Maven coordinates of the example project -->
	<groupId>com.mumz.test.lucene</groupId>
	<artifactId>ApacheLuceneTest</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<name>ApacheLuceneTest</name>
	<description>ApacheLuceneTest</description>
	<!-- The only dependency declared here is lucene-core 3.6.1 -->
	<dependencies>
		<dependency>
			<artifactId>lucene-core</artifactId>
			<groupId>org.apache.lucene</groupId>
			<type>jar</type>
			<version>3.6.1</version>
		</dependency>
	</dependencies>
</project>

Second, we will write our Indexer. The Indexer accepts a path to a directory, then recursively reads the contents of that directory and builds an in-memory index. The smaller units that Lucene indexes are called Documents, and you can add multiple Fields to a Document. Here we add the file name to our index in a field that we call title.

InMemoryDirectoryIndexer.java


package com.mumz.test.lucene.first;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds an in-memory Lucene index of the file and directory names found
 * under a given path. The index is built once, in the constructor, and is
 * exposed through {@link #getDirectory()}.
 * 
 * @author prabhat.jha
 */
public class InMemoryDirectoryIndexer {
	
	/** The in-memory directory that holds the index. */
	private Directory	directory	= null;

	/**
	 * Creates the indexer and immediately indexes everything below the given
	 * path.
	 * 
	 * @param path
	 *            the file-system path of the directory to index
	 */
	public InMemoryDirectoryIndexer(String path) {
		indexDirectory(path);
	}

	/**
	 * Gets the directory that holds the index.
	 * 
	 * @return the directory
	 * @throws IllegalStateException
	 *             if no directory has been created
	 */
	public Directory getDirectory() {
		if (directory == null) {
			throw new IllegalStateException("Index hasn't been built, check configratuion and reIndex");
		}
		return directory;
	}

	/**
	 * Creates a fresh {@link RAMDirectory} and fills it with one document per
	 * entry found below the given path. Index and I/O failures are printed and
	 * not propagated; the writer is always closed.
	 * 
	 * @param path
	 *            the file-system path of the directory to index
	 */
	private void indexDirectory(String path) {
		/**
		 * We will create an in memory search, don't do this if you are indexing a huge amount of data
		 * as this can lead to heavy memory foot print.
		 */
		directory = new RAMDirectory();
		/**
		 * Specify the version, if there is a change in lucene version we have to reindex
		 */
		Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_36);
		IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_36, analyzer);
		IndexWriter indexWriter = null;
		try {
			/** 
			 * Create index writer
			 */
			indexWriter = new IndexWriter(directory, indexWriterConfig);
			File root = new File(path);
			/**
			 * Add files recursively to index writer
			 */
			addIndex(indexWriter, root);
		} catch (CorruptIndexException e) {
			e.printStackTrace();
		} catch (LockObtainFailedException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			try {
				if (indexWriter != null) {
					indexWriter.close();
				}
				indexWriter = null;
			} catch (CorruptIndexException e) {
				e.printStackTrace();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}
	
	
	/**
	 * Adds one document, with a stored and analyzed "title" field holding the
	 * entry name, for every entry directly inside {@code root}, descending
	 * into sub-directories first. Sub-directories are therefore indexed under
	 * their own name as well as having their contents indexed.
	 * <p>
	 * If {@code root} cannot be listed (it is not a directory, does not exist
	 * or cannot be read) a message is printed and nothing is added.
	 * 
	 * @param indexWriter
	 *            the index writer
	 * @param root
	 *            the directory whose entries are indexed
	 * @throws CorruptIndexException
	 *             the corrupt index exception
	 * @throws IOException
	 *             the IO exception
	 */
	private void addIndex(IndexWriter indexWriter, File root) throws CorruptIndexException, IOException{
		File[] entries = root.listFiles();
		if (entries == null) {
			// listFiles() returns null rather than an empty array when the path
			// is not a directory or an I/O error occurs; iterating would NPE.
			System.err.println("Unable to list files under : " + root);
			return;
		}
		for (File file : entries) {
			if(file.isDirectory()){
				addIndex(indexWriter, file);
			}
			Document document = new Document();
			document.add(new Field("title", file.getName(), Field.Store.YES, Field.Index.ANALYZED));
			indexWriter.addDocument(document);
		}
	}
}

Third we will write our Searcher, code which searches against the index built in the last step.
InMemoryDirectorySearcher.java


package com.mumz.test.lucene.first;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

/**
 * Runs "title" queries against an index held in a Lucene {@link Directory}
 * and prints the matches to standard output.
 * 
 * @author prabhat.jha
 */
public class InMemoryDirectorySearcher {
	
	/** The directory that holds the index to search. */
	private Directory	directory	= null;

	/**
	 * The Constructor.
	 * 
	 * @param directory
	 *            the directory containing the index to search
	 */
	public InMemoryDirectorySearcher(Directory directory) {
		this.directory = directory;
	}

	/**
	 * Parses {@code fileName} as a query on the "title" field, collects up to
	 * 100 top-scoring documents and prints each stored title together with its
	 * score. Parse, index and I/O failures are printed and not propagated.
	 * <p>
	 * A reader and a searcher are opened for each call and both are closed
	 * before the method returns.
	 * 
	 * @param fileName
	 *            the query text, typically a file name
	 */
	public void search(String fileName) {
		IndexReader indexReader = null;
		IndexSearcher indexSearcher = null;
		try {
			/**
			 * Specify the version
			 */
			Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_36);
			/**
			 * Create query from the file name, we built out index for title so we have to search against the same.
			 */
			Query query = new QueryParser(Version.LUCENE_36, "title", analyzer).parse(fileName);
			indexReader = IndexReader.open(directory);
			indexSearcher = new IndexSearcher(indexReader);
			/**
			 * This will hold all the results which results from the search operation
			 */
			TopScoreDocCollector results = TopScoreDocCollector.create(100, true);
			indexSearcher.search(query, results);
			ScoreDoc[] scores = results.topDocs().scoreDocs;
			for (ScoreDoc scoreDoc : scores) {
				System.out.println("Found : " + indexSearcher.doc(scoreDoc.doc).get("title") + " with hits : " + scoreDoc.score);
			}
		} catch (ParseException e) {
			e.printStackTrace();
		} catch (CorruptIndexException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if(indexSearcher != null){
				try {
					indexSearcher.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			/**
			 * Closing a searcher built from an existing reader does not close
			 * that reader, so it has to be released explicitly.
			 */
			if(indexReader != null){
				try {
					indexReader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
}

Final step is to write a simple java class to showcase our indexer and searcher in action.
InMemoryMainApp.java


package com.mumz.test.lucene.first;

import java.util.Scanner;

/**
 * Console driver for the in-memory example: asks for a directory, indexes it
 * and then answers file-name queries until the user quits.
 * 
 * @author prabhat.jha
 */
public class InMemoryMainApp {
	
	/**
	 * Prompts on standard input for the directory to index, a confirmation
	 * and then repeated queries. Entering anything other than "Y" at the
	 * confirmation prompt, or "quit" at the query prompt, ends the program
	 * normally (exit status 0).
	 * <p>
	 * Each answer is read as a single whitespace-delimited token, so paths or
	 * queries containing spaces are not supported.
	 * 
	 * @param args
	 *            the command-line arguments (unused)
	 */
	public static void main(String[] args) {
		Scanner scanner = new Scanner(System.in);
		try {
			System.out.println("Enter directory name");
			String directory = scanner.next();
			System.out.println("Should Index ? Enter Y for Yes or else q or quit to exit.");
			String response = scanner.next();
			if(!"y".equalsIgnoreCase(response)){
				// "q", "quit" and any unrecognised answer all end the program;
				// a user-requested exit is not an error, so no failure status.
				return;
			}
			InMemoryDirectoryIndexer inMemoryDirectoryIndexer = new InMemoryDirectoryIndexer(directory);
			// The index does not change between queries, so one searcher is enough.
			InMemoryDirectorySearcher inMemoryFileSearcher = new InMemoryDirectorySearcher(inMemoryDirectoryIndexer.getDirectory());
			while(true){
				System.out.println("Enter fileName to query or quit to exit.");
				String fileName = scanner.next();
				if("Quit".equalsIgnoreCase(fileName)){
					return;
				}
				inMemoryFileSearcher.search(fileName);
			}
		} finally {
			// Runs on every exit path, including unexpected end of input.
			scanner.close();
		}
	}
}

That’s all; we have just touched the tip of the iceberg. It is a vast and very interesting space, so let’s see how much we can cover in the coming days.