@article{fdi:010079425,
  title = {{M}easuring performance metrics of machine learning algorithms for detecting and classifying transposable elements},
  author = {{O}rozco-{A}rias, {S}. and {P}ina, {J}. {S}. and {T}abares-{S}oto, {R}. and {C}astillo-{O}ssa, {L}. {F}. and {G}uyot, {R}omain and {I}saza, {G}.},
  language = {{ENG}},
  abstract = {Because of the promising results obtained by machine learning (ML) approaches in several fields, the use of ML to solve problems in bioinformatics is becoming increasingly common. In genomics, a current challenge is the detection and classification of transposable elements (TEs), because bioinformatics methods involve tedious tasks. Thus, ML has recently been evaluated on TE datasets, demonstrating better results than bioinformatics applications. A crucial step for ML approaches is the selection of metrics that measure the realistic performance of algorithms. Each metric has specific characteristics and measures properties of the predicted results that may differ from one metric to another. Although the most common way to compare measures is empirical analysis, a non-result-based methodology, called measure invariance properties, has been proposed. These properties are calculated on the basis of whether a given measure changes its value under certain modifications of the confusion matrix, giving comparative parameters independent of the datasets. Measure invariance properties make metrics more or less informative, particularly on unbalanced, monomodal, or multimodal negative-class datasets and on real or simulated datasets. Although several studies have applied ML to detect and classify TEs, no work has evaluated performance metrics for TE tasks. Here, we analyzed, through bibliographic sources, 26 different metrics used in binary, multiclass, and hierarchical classification, together with their invariance properties. We then corroborated our findings using freely available TE datasets and commonly used ML algorithms. Based on our analysis, the most suitable metrics for TE tasks must remain stable even on highly unbalanced datasets, datasets with a multimodal negative class, and training datasets containing errors or outliers. On these criteria, we conclude that the F1-score and the area under the precision-recall curve are the most informative metrics, since they are calculated from other metrics and thus provide insight into the development of an ML application.},
  keywords = {transposable elements ; metrics ; machine learning ; deep learning ; detection ; classification},
  journal = {{P}rocesses},
  volume = {8},
  number = {6},
  pages = {art. 638 [18p.]},
  year = {2020},
  DOI = {10.3390/pr8060638},
  URL = {https://www.documentation.ird.fr/hor/fdi:010079425},
}
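
The abstract's conclusion, that accuracy-style metrics mislead on highly unbalanced TE datasets while the F1-score and the area under the precision-recall curve (AUPRC) remain informative, can be illustrated with a minimal sketch. The following is not code from the paper: it uses scikit-learn metric functions on an invented toy dataset with roughly 5% positives, and the "majority-class" classifier is a hypothetical degenerate baseline.

    import numpy as np
    from sklearn.metrics import accuracy_score, f1_score, average_precision_score

    rng = np.random.default_rng(0)
    # Toy unbalanced labels (~5% positives), e.g. TE vs. non-TE sequence windows.
    y_true = (rng.random(1000) < 0.05).astype(int)

    # Degenerate baseline: always predict the negative majority class.
    y_pred_majority = np.zeros_like(y_true)
    # Mildly informative scores, correlated with the true label.
    y_scores = 0.3 * y_true + 0.7 * rng.random(1000)

    # Accuracy looks excellent (~0.95) for the useless baseline...
    print("accuracy (majority baseline):", accuracy_score(y_true, y_pred_majority))
    # ...while F1 exposes it (no true positives, so F1 = 0).
    print("F1 (majority baseline):", f1_score(y_true, y_pred_majority, zero_division=0))
    # AUPRC evaluates the ranking quality of the scores, independent of a threshold.
    print("AUPRC (scored classifier):", average_precision_score(y_true, y_scores))

Note that on a 95:5 split, a random classifier's average precision is roughly the positive prevalence (about 0.05), so AUPRC has a much lower chance-level baseline than accuracy and is correspondingly harder to inflate on unbalanced data.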