% Conference paper in the IEEE DSAA 2021 proceedings (see booktitle).
% The url field points to the record in the IRD documentation base.
@inproceedings{fdi:010085545,
  author    = {Amer-Yahia, S. and Berti-Equille, Laure and Chibah, A.},
  title     = {A Framework for Statistically-Sound Customer Segment Search},
  booktitle = {2021 {IEEE} 8th International Conference on Data Science and Advanced Analytics ({DSAA})},
  pages     = {1--10},
  publisher = {IEEE},
  address   = {Piscataway},
  year      = {2021},
  language  = {ENG},
  doi       = {10.1109/DSAA53316.2021.9564199},
  isbn      = {978-1-6654-2100-3},
  url       = {https://www.documentation.ird.fr/hor/fdi:010085545},
  abstract  = {We develop S4, a Statistically-Sound Segment Search framework that combines principled data partitioning and sound statistical testing to verify common hypotheses in retail data and return interpretable customer data segments. Our framework accommodates one-sample, two-sample, and multiple-sample testing, to provide various aggregations and comparisons of customer transactions. To control the proportion of false discoveries in multiple hypothesis testing, we enforce an FDR-controlling procedure and formulate a unified optimization problem that returns customer data segments that satisfy the test for a given significance level, maximize coverage of the input data, and are within a risk capital. We develop a greedy algorithm to explore different data partitions and test multiple hypotheses in a sound manner. Our extensive experiments on four retail data sets examine the interaction between significance, risk and coverage, and demonstrate the expressivity, usefulness, and scalability of S4 in practice.},
}