% Journal article record (Data in Brief, vol. 56, 2024). The citation key is the fdi identifier that also ends the URL.
% "pages" holds the article number; the page count from the original record ("[10 p.]") is kept in "pagetotal".
@article{fdi:010091277,
  author    = {Nainia, A. and Vignes-Lebbe, R. and Chenin, Eric and Sahraoui, M. and Mousannif, H. and Zahir, J.},
  title     = {{FloraNER}: a new dataset for species and morphological terms named entity recognition in {French} botanical text},
  journal   = {Data in Brief},
  volume    = {56},
  pages     = {110824},
  pagetotal = {10},
  year      = {2024},
  language  = {ENG},
  issn      = {2352-3409},
  doi       = {10.1016/j.dib.2024.110824},
  url       = {https://www.documentation.ird.fr/hor/fdi:010091277},
  keywords  = {NER Dataset ; Biodiversity dataset ; Species identification dataset ; Plant morphology dataset ; NOUVELLE CALEDONIE},
  abstract  = {FloraNER is a distantly supervised named entity recognition dataset (NER). The dataset is built from botanical French literature extracted from the OCR-preprocessed flora of New Caledonia, provided by the National Museum of Natural History in France (MNHN), and distantly annotated with a botanical French corpus created by merging botanical lexicons available online. FloraNER comprises separate subdatasets for the recognition of plant species names, as well as coarse-grained and fine-grained botanical morphological terms. The resulting datasets are in CSV format, displaying textual data, identified named entities, and their annotations, covering one named entity type ``Species'' (Esp{\`e}ce in French) for species name identification, two named entity types ``Organ'' and ``Descriptor'' for coarse-grained morphological term identification, and eight named entity types for fine-grained morphological term identification: Organ, Descriptor, Form, Color, Development, Structure, Surface, Position, Disposition, and Measure. This dataset can be utilized to train and evaluate named entity recognition models for extracting information from botanical French literature.},
}