% Conference paper by Yang, Zola and Aluru in the IPDPS 2011 proceedings.
% The exported record carried a series field that repeated the booktitle
% verbatim; it is omitted here so styles that print both fields do not
% show the proceedings name twice. Acronyms are brace-protected against
% style recasing, and author names use the unambiguous "Last, First" form.
@inproceedings{a20528e3b6894cf2b55733eb0bddeaa7,
  author    = {Yang, Xiao and Zola, Jaroslaw and Aluru, Srinivas},
  title     = {Parallel metagenomic sequence clustering via sketching and maximal quasi-clique enumeration on map-reduce clouds},
  booktitle = {Proceedings - 25th {IEEE} International Parallel and Distributed Processing Symposium, {IPDPS} 2011},
  year      = {2011},
  pages     = {1223--1233},
  doi       = {10.1109/IPDPS.2011.116},
  isbn      = {9780769543857},
  language  = {English},
  keywords  = {MapReduce, cloud computing, metagenomics, next generation sequencing, quasi clique enumeration, sequence clustering, sketching},
  abstract  = {Taxonomic clustering of species is an important and frequently arising problem in metagenomics. High-throughput next generation sequencing is facilitating the creation of large metagenomic samples, while at the same time making the clustering problem harder due to the short sequence length supported and unknown species sampled. In this paper, we present a parallel algorithm for hierarchical taxonomic clustering of large metagenomic samples with support for overlapping clusters. We adapt the sketching techniques originally developed for web document clustering to deduce significant similarities between pairs of sequences without resorting to expensive all vs. all alignments. We formulate the metagenomics classification problem as that of maximal quasi-clique enumeration in the resulting similarity graph, at multiple levels of the hierarchy as prescribed by different similarity thresholds. We cast execution of the underlying algorithmic steps as applications of the map-reduce framework to achieve a cloud based implementation. Apart from solving an important problem in metagenomics, this work demonstrates the applicability of map-reduce framework in relatively complicated algorithmic settings.},
  note      = {25th {IEEE} International Parallel and Distributed Processing Symposium, {IPDPS} 2011 ; Conference date: 16-05-2011 Through 20-05-2011},
}