@article{Gopi_Gulhane_Kulkarni_Shen_Shokouhi_Yekhanin_2021, title={Differentially Private Set Union}, volume={11}, url={https://journalprivacyconfidentiality.org/index.php/jpc/article/view/780}, DOI={10.29012/jpc.780}, abstractNote={<p>We study the basic operation of set union in the global model of differential privacy. In this problem, we are given a universe $U$ of items, possibly of infinite size, and a database $D$ of users. Each user $i$ contributes a subset $W_i \subseteq U$ of items. We want an ($\epsilon$,$\delta$)-differentially private algorithm which outputs a subset $S \subset \cup_i W_i$ such that the size of $S$ is as large as possible. The problem arises in countless real world applications; it is particularly ubiquitous in natural language processing (NLP) applications as vocabulary extraction. For example, discovering words, sentences, $n$-grams etc., from private text data belonging to users is an instance of the set union problem.Known algorithms for this problem proceed by collecting a subset of items from each user, taking the union of such subsets, and disclosing the items whose noisy counts fall above a certain threshold. Crucially, in the above process, the contribution of each individual user is always independent of the items held by other users, resulting in a wasteful aggregation process, where some item counts happen to be way above the threshold. We deviate from the above paradigm by allowing users to contribute their items in a {\em dependent fashion}, guided by a {\em policy}. In this new setting ensuring privacy is significantly delicate. We prove that any policy which has certain {\em contractive} properties would result in a differentially private algorithm. We design two new algorithms for differentially private set union, one using Laplace noise and other Gaussian noise, which use $\ell_1$-contractive and $\ell_2$-contractive policies respectively and provide concrete examples of such policies. Our experiments show that the new algorithms in combination with our policies significantly outperform previously known mechanisms for the problem.</p>}, number={3}, journal={Journal of Privacy and Confidentiality}, author={Gopi, Sivakanth and Gulhane, Pankaj and Kulkarni, Janardhan and Shen, Judy Hanwen and Shokouhi, Milad and Yekhanin, Sergey}, year={2021}, month={Dec.} }