% Nainia et al. (2024): transformer-based NER pipeline (CamemBERT) for
% extracting botanical information from French floras.
% Record cleaned from an automated export (see the url field for the source record):
%   - per-letter brace protection replaced by whole-word protection;
%   - empty editor/keywords/booktitle fields removed;
%   - non-standard `numero` renamed to the standard `number` field.
% NOTE(review): the DOI prefix (10.5121/csit) and the ISSN may not refer to the
% same venue -- confirm the ISSN and whether @article is the right entry type.
% NOTE(review): given names are abbreviated in the source record; expand them
% to full names if they can be confirmed against the publication.
@article{fdi:010094502,
  author   = {Nainia, A. and Vignes-Lebbe, R. and Chenin, Eric and Sahraoui, M. and Mousannif, H. and Zahir, J.},
  title    = {A transformer-based {NLP} pipeline for enhanced extraction of botanical information using {CamemBERT} on {French} literature},
  journal  = {{NLP} and Information Retrieval},
  volume   = {14},
  number   = {6},
  pages    = {59--78},
  year     = {2024},
  language = {ENG},
  issn     = {2079-9292},
  doi      = {10.5121/csit.2024.140605},
  url      = {https://www.documentation.ird.fr/hor/fdi:010094502},
  abstract = {This research investigates the untapped wealth of centuries-old French botanical literature, particularly focused on floras, which are comprehensive guides detailing plant species in specific regions. Despite their significance, this literature remains largely unexplored in the context of AI integration. Our objective is to bridge this gap by constructing a specialized botanical French dataset sourced from the flora of New Caledonia. We propose a transformer-based Named Entity Recognition pipeline, leveraging distant supervision and CamemBERT, for the automated extraction and structuring of botanical information. The results demonstrate exceptional performance: for species names extraction, the NER model achieves precision (0.94), recall (0.98), and F1-score (0.96), while for fine-grained extraction of botanical morphological terms, the CamemBERT-based NER model attains precision (0.93), recall (0.96), and F1-score (0.94). This work contributes to the exploration of valuable botanical literature by underscoring the capability of AI models to automate information extraction from complex and diverse texts.},
}