@comment{
  LUMA benchmark dataset paper, SIGIR 2025 proceedings, pages 3782--3791.
  Record exported from the IRD documentation database (see the url field).
  Typed as inproceedings with the venue stored in booktitle, because the
  venue is a conference proceedings rather than a journal.
  Fields that were empty in the export (editor, numero) are omitted.
  Only acronyms are brace-protected; given names are kept as exported.
}
@inproceedings{fdi:010095848,
  author    = {Bezirganyan, G. and Sellami, S. and Berti-Equille, Laure and Fournier, S.},
  title     = {{LUMA}: A Benchmark Dataset for Learning from Uncertain and Multimodal Data},
  booktitle = {Proceedings of the 48th International {ACM} {SIGIR} Conference on Research and Development in Information Retrieval, {SIGIR} 2025},
  pages     = {3782--3791},
  year      = {2025},
  doi       = {10.1145/3726302.3730302},
  url       = {https://www.documentation.ird.fr/hor/fdi:010095848},
  language  = {ENG},
  keywords  = {multimodal deep learning ; uncertainty quantification ; dataset},
  abstract  = {Multimodal Deep Learning enhances decision-making by integrating diverse information sources, such as texts, images, audio, and videos. To develop trustworthy multimodal approaches, it is essential to understand how uncertainty impacts these models. We propose LUMA, a unique multimodal dataset, featuring audio, image, and textual data from 50 classes, specifically designed for learning from uncertain data. It extends the well-known CIFAR 10/100 dataset with audio samples extracted from three audio corpora, and text data generated using the Gemma-7B Large Language Model (LLM). The LUMA dataset enables the controlled injection of varying types and degrees of uncertainty to achieve and tailor specific experiments and benchmarking initiatives. LUMA is also available as a Python package including the functions for generating multiple variants of the dataset with controlling the diversity of the data, the amount of noise for each modality, and adding out-of-distribution samples. A baseline pre-trained model is also provided alongside three uncertainty quantification methods: Monte-Carlo Dropout, Deep Ensemble, and Reliable Conflictive Multi-View Learning. This comprehensive dataset and its tools are intended to promote and support the development, evaluation, and benchmarking of trustworthy and robust multimodal deep learning approaches. We anticipate that the LUMA dataset will help the research community to design more trustworthy and robust machine learning approaches for safety critical applications. The code and instructions for downloading and processing the dataset can be found at: https://github.com/bezirganyan/LUMA.},
}