% Journal article on the "masking index" for hidden data glitches
% (Knowledge and Information Systems, vol. 44, no. 2, 2015).
% The url field points to the IRD documentation record carrying the same identifier as the citation key.
@article{fdi:010064844,
  author   = {Berti-Equille, Laure and Loh, J. M. and Dasu, T.},
  title    = {A masking index for quantifying hidden glitches},
  journal  = {Knowledge and Information Systems},
  volume   = {44},
  number   = {2},
  pages    = {253--277},
  year     = {2015},
  issn     = {0219-1377},
  doi      = {10.1007/s10115-014-0760-0},
  url      = {https://www.documentation.ird.fr/hor/fdi:010064844},
  language = {english},
  keywords = {Anomaly detection, Masking, Duplicate record identification, Missing values, Outlier detection},
  abstract = {Data glitches are errors in a dataset. They are complex entities that often span multiple attributes and records. When they co-occur in data, the presence of one type of glitch can hinder the detection of another type of glitch. This phenomenon is called masking. In this paper, we define two important types of masking and propose a novel, statistically rigorous indicator called masking index for quantifying the hidden glitches. We outline four cases of masking: outliers masked by missing values, outliers masked by duplicates, duplicates masked by missing values, and duplicates masked by outliers. The masking index is critical for data quality profiling and data exploration. It enables a user to measure the extent of masking and hence the confidence in the data. In this sense, it is a valuable data quality index for choosing an anomaly detection method that is best suited for the glitches that are present in any given dataset. We demonstrate the utility and effectiveness of the masking index by intensive experiments on synthetic and real-world datasets.},
}