@inproceedings{a86396ad7eaa4bc48a3b79379424fa65,
title = "Improving the Sensitivity of MinHash Through Hash-Value Analysis",
abstract = "MinHash sketching is an important algorithm for efficient document retrieval and bioinformatics. We show that the value of the matching MinHash codes convey additional information about the Jaccard similarity of S and T over and above the fact that the MinHash codes agree. This observation holds the potential to increase the sensitivity of minhash-based retrieval systems. We analyze the expected Jaccard similarity of two sets as a function of observing a matching MinHash value a under a reasonable prior distribution on intersection set sizes, and present a practical approach to using MinHash values to improve the sensitivity of traditional Jaccard similarity estimation, based on the Kolmogorov-Smirnov statistical test for sample distributions. Experiments over a wide range of hash function counts and set similarities show a small but consistent improvement over chance at predicting over/under-estimation, yielding an average accuracy of 61\% over the range of experiments.",
keywords = "MinHash sketching, hashing, sequence similarity",
author = "Gregory Kucherov and Steven Skiena",
note = "Publisher Copyright: {\textcopyright} 2023 Schloss Dagstuhl- Leibniz-Zentrum fur Informatik GmbH, Dagstuhl Publishing. All rights reserved.; 34th Annual Symposium on Combinatorial Pattern Matching, CPM 2023 ; Conference date: 26-06-2023 Through 28-06-2023",
year = "2023",
month = jun,
doi = "10.4230/LIPIcs.CPM.2023.20",
language = "English",
series = "Leibniz International Proceedings in Informatics, LIPIcs",
publisher = "Schloss Dagstuhl- Leibniz-Zentrum fur Informatik GmbH, Dagstuhl Publishing",
editor = "Laurent Bulteau and Zsuzsanna Liptak",
booktitle = "34th Annual Symposium on Combinatorial Pattern Matching, CPM 2023",
}