Abstract:
In this paper, we propose a probabilistic algorithm for detecting near-duplicate
text, audio, and video resources efficiently and effectively in
large-scale P2P systems. To this end, we present a thorough cost and
probabilistic analysis that allows the algorithm to adapt to
network and data collection characteristics for minimizing network
cost. In addition, we extend the algorithm so that it can identify
similar videos, even if some of the videos are split into different files.
A thorough theoretical analysis as well as a large-scale
experimental evaluation on networks of up to 100,000 peers using
real-world datasets of more than 200 Gbytes demonstrate the viability of our approach.
@inproceedings{nddOptimize,
  author    = {Papapetrou, Odysseas and Ramesh, Sukriti and Siersdorfer, Stefan and Nejdl, Wolfgang},
  title     = {Optimizing Near Duplicate Detection for {P2P} Networks},
  booktitle = {{IEEE} International Conference on Peer-to-Peer Computing ({P2P}'10), Delft, Netherlands},
  year      = {2010},
}