% Loucoubar et al. (2011), PLoS ONE 6(9), e24085: HyperCube data mining applied to a malaria cohort data set.
@article{fdi:010053838,
  author   = {Loucoubar, C. and Paul, R. and Bar-Hen, A. and Huret, A. and Tall, A. and Sokhna, Cheikh and Trape, Jean-Fran{\c{c}}ois and Ly, A. B. and Faye, J. and Badiane, A. and Diakhaby, G. and Sarr, F. D. and Diop, A. and Sakuntabhai, A. and Bureau, J. F.},
  title    = {An Exhaustive, Non-{Euclidean}, Non-Parametric Data Mining Tool for Unraveling the Complexity of Biological Systems: Novel Insights into Malaria},
  journal  = {{PLoS} {ONE}},
  volume   = {6},
  number   = {9},
  pages    = {e24085},
  year     = {2011},
  issn     = {1932-6203},
  doi      = {10.1371/journal.pone.0024085},
  url      = {https://www.documentation.ird.fr/hor/fdi:010053838},
  language = {english},
  abstract = {Complex, high-dimensional data sets pose significant analytical challenges in the post-genomic era. Such data sets are not exclusive to genetic analyses and are also pertinent to epidemiology. There has been considerable effort to develop hypothesis-free data mining and machine learning methodologies. However, current methodologies lack exhaustivity and general applicability. Here we use a novel non-parametric, non-euclidean data mining tool, HyperCube\textregistered{}, to explore exhaustively a complex epidemiological malaria data set by searching for over density of events in m-dimensional space. Hotspots of over density correspond to strings of variables, rules, that determine, in this case, the occurrence of Plasmodium falciparum clinical malaria episodes. The data set contained 46,837 outcome events from 1,653 individuals and 34 explanatory variables. The best predictive rule contained 1,689 events from 148 individuals and was defined as: individuals present during 1992--2003, aged 1--5 years old, having hemoglobin AA, and having had previous Plasmodium malariae malaria parasite infection $\leq$ 10 times. These individuals had 3.71 times more P. falciparum clinical malaria episodes than the general population. We validated the rule in two different cohorts. We compared and contrasted the HyperCube\textregistered{} rule with the rules using variables identified by both traditional statistical methods and non-parametric regression tree methods. In addition, we tried all possible sub-stratified quantitative variables. No other model with equal or greater representativity gave a higher Relative Risk. Although three of the four variables in the rule were intuitive, the effect of number of P. malariae episodes was not. HyperCube\textregistered{} efficiently sub-stratified quantitative variables to optimize the rule and was able to identify interactions among the variables, tasks not easy to perform using standard data mining methods. Search of local over density in m-dimensional space, explained by easily interpretable rules, is thus seemingly ideal for generating hypotheses for large datasets to unravel the complexity inherent in biological systems.},
}