% ICDE 2011 conference paper; record exported from documentation.ird.fr (see url field).
% Given names of the 2nd and 3rd authors were expanded from initials -- verify against the DOI record.
@inproceedings{fdi:010055317,
  author    = {Berti-Equille, Laure and Dasu, Tamraparni and Srivastava, Divesh},
  title     = {Discovery of Complex Glitch Patterns: A Novel Approach to Quantitative Data Cleaning},
  booktitle = {Proceedings of the 27th International Conference on Data Engineering},
  series    = {{IEEE} Conference Publication},
  pages     = {733--744},
  year      = {2011},
  doi       = {10.1109/ICDE.2011.5767864},
  isbn      = {978-1-4244-9194-0},
  url       = {https://www.documentation.ird.fr/hor/fdi:010055317},
  language  = {ENG},
  keywords  = {RESEAU INFORMATIQUE ; TRAITEMENT DE DONNEES ; ERREUR ; METHODE D'ANALYSE ; ANALYSE STATISTIQUE},
  abstract  = {Quantitative Data Cleaning (QDC) is the use of statistical and other analytical techniques to detect, quantify, and correct data quality problems (or glitches). Current QDC approaches focus on addressing each category of data glitch individually. However, in real-world data, different types of data glitches co-occur in complex patterns. These patterns and interactions between glitches offer valuable clues for developing effective domain-specific quantitative cleaning strategies. In this paper, we address the shortcomings of the extant QDC methods by proposing a novel framework, the DEC (Detect-Explore-Clean) framework. It is a comprehensive approach for the definition, detection and cleaning of complex, multi-type data glitches. We exploit the distributions and interactions of different types of glitches to develop data-driven cleaning strategies that may offer significant advantages over blind strategies. The DEC framework is a statistically rigorous methodology for evaluating and scoring glitches and selecting the quantitative cleaning strategies that result in cleaned data sets that are statistically proximal to user specifications. We demonstrate the efficacy and scalability of the DEC framework on very large real-world and synthetic data sets.},
}