% Conference paper from the SEDE 2014 proceedings (pages 111--118).
% Names are stored in "Last, First" form. The proceedings title lives only in
% booktitle; the conference dates are recorded in the note field.
@inproceedings{22738fc592914840af4fd10420520d83,
  author    = {Musial, Ewa and Hwang, Jeong Hyon and Chen, Mei Hwa and Ravi, S. S.},
  title     = {Efficient entity resolution for heterogeneous datasets},
  booktitle = {23rd International Conference on Software Engineering and Data Engineering, SEDE 2014},
  editor    = {Ding, Ling and Shi, Yan},
  publisher = {International Society of Computers and Their Applications (ISCA)},
  pages     = {111--118},
  year      = {2014},
  language  = {English},
  keywords  = {Bloom filters, Duplicate elimination, Entity resolution},
  abstract  = {Entity resolution (ER) is the process of determining which records in a collection or collections represent the same entity. It has become an emerging challenge in this big data era. A number of techniques have been developed to improve duplicate elimination. The majority of these approaches employ blocking methods and focus on homogeneous data collections. Applying existing approaches to heterogeneous data collections may encounter both precision and efficiency difficulties. We present a new technique applicable for heterogeneous data that is more efficient than the existing techniques. The technique utilizes a selective comparison algorithm which not only provides a blocking scheme, but also a two-phase comparison selection process. Our approach uses a finer token selection process to avoid building oversized blocks. Then, it filters out blocks containing records that are not likely to match. In addition, we process the comparisons within blocks to resolve only those that are likely to be duplicates. As a result, we significantly reduce the number of comparisons and increase the number of detected duplicates. The results of our experimental studies demonstrate the usefulness of our algorithm with respect to both effectiveness and efficiency.},
  note      = {Publisher Copyright: {\textcopyright} ISCA, SEDE 2014.; 23rd International Conference on Software Engineering and Data Engineering, SEDE 2014 ; Conference date: 13-10-2014 Through 15-10-2014},
}