% Journal article, PLoS Computational Biology 17(8), 2021.
% The citation key is the record identifier that also ends the url field; it is
% kept unchanged so existing citations still resolve.
% The exported record gave only an extent ("[21 p.]") instead of a locator:
% pages now holds the article e-locator taken from the DOI suffix and the
% extent is preserved in pagetotal (TODO confirm the e-locator against the DOI).
@article{fdi:010083146,
  author    = {Blassel, L. and Tostevin, A. and Villabona-Arenas, C. J. and Peeters, Martine and Hue, S. and Gascuel, O. and {{UK HIV Drug Resistance Database}}},
  title     = {Using machine learning and big data to explore the drug resistance landscape in {HIV}},
  journal   = {{PLoS} Computational Biology},
  volume    = {17},
  number    = {8},
  pages     = {e1008873},
  pagetotal = {21},
  year      = {2021},
  issn      = {1553-734X},
  doi       = {10.1371/journal.pcbi.1008873},
  url       = {https://www.documentation.ird.fr/hor/fdi:010083146},
  language  = {ENG},
  keywords  = {CAMEROUN ; REPUBLIQUE DEMOCRATIQUE DU CONGO ; BURUNDI ; BURKINA FASO ; TOGO ; ROYAUME UNI},
  abstract  = {Author summary: Almost all drugs to treat HIV target the Reverse Transcriptase (RT) and Drug resistance mutations (DRMs) appear in HIV under treatment pressure. Resistant strains can be transmitted and limit treatment options at the population level. Classically, multiple statistical testing is used to find DRMs, by comparing virus sequences of treated and naive populations. However, with this method, each mutation is considered individually and we cannot hope to reveal any interaction (epistasis) between them. Here, we used machine learning to discover new DRMs and study potential epistasis effects. We applied this approach to a very large UK dataset comprising $\approx$ 55,000 RT sequences. Results robustness was checked on different UK and African datasets. Six new mutations associated to resistance were found. All six have a low genetic barrier and show high correlations with known DRMs. Moreover, all these mutations are close to either the active site or the regulatory binding pocket of RT. Thus, they are good candidates for further wet experiments to establish their role in drug resistance. Importantly, our results indicate that epistasis seems to be limited to the classical scheme where primary DRMs confer resistance and associated mutations modulate the strength of the resistance and/or compensate for the fitness cost induced by DRMs.
    Drug resistance mutations (DRMs) appear in HIV under treatment pressure. DRMs are commonly transmitted to naive patients. The standard approach to reveal new DRMs is to test for significant frequency differences of mutations between treated and naive patients. However, we then consider each mutation individually and cannot hope to study interactions between several mutations. Here, we aim to leverage the ever-growing quantity of high-quality sequence data and machine learning methods to study such interactions (i.e. epistasis), as well as try to find new DRMs. We trained classifiers to discriminate between Reverse Transcriptase Inhibitor (RTI)-experienced and RTI-naive samples on a large HIV-1 reverse transcriptase (RT) sequence dataset from the UK (n $\approx$ 55,000), using all observed mutations as binary representation features. To assess the robustness of our findings, our classifiers were evaluated on independent data sets, both from the UK and Africa. Important representation features for each classifier were then extracted as potential DRMs. To find novel DRMs, we repeated this process by removing either features or samples associated to known DRMs. When keeping all known resistance signal, we detected sufficiently prevalent known DRMs, thus validating the approach. When removing features corresponding to known DRMs, our classifiers retained some prediction accuracy, and six new mutations significantly associated with resistance were identified. These six mutations have a low genetic barrier, are correlated to known DRMs, and are spatially close to either the RT active site or the regulatory binding pocket. When removing both known DRM features and sequences containing at least one known DRM, our classifiers lose all prediction accuracy. These results likely indicate that all mutations directly conferring resistance have been found, and that our newly discovered DRMs are accessory or compensatory mutations. Moreover, apart from the accessory nature of the relationships we found, we did not find any significant signal of further, more subtle epistasis combining several mutations which individually do not seem to confer any resistance.},
}