% Journal article record for the OryzaGP 2021 update paper.
% The citation key is the "fdi:" identifier that also ends the url field.
% Cleanup notes:
%   - the issue number lives in the standard "number" field so styles print it;
%   - the exported pages value was "e27 [4 ]"; "e27" is kept as the article
%     number and the bracketed 4 is presumably a page count, stored in
%     "pagetotal" (ignored by classic BibTeX styles) -- TODO confirm;
%   - given names of three authors were only available as initials.
@article{fdi:010093865,
  author    = {Larmande, Pierre and Liu, Y. and Yao, X. and Xia, J.},
  title     = {{OryzaGP} 2021 update: a rice gene and protein dataset for named-entity recognition},
  journal   = {Genomics and Informatics},
  volume    = {19},
  number    = {3},
  pages     = {e27},
  pagetotal = {4},
  year      = {2021},
  language  = {english},
  issn      = {2234-0742},
  doi       = {10.5808/gi.21015},
  url       = {https://www.documentation.ird.fr/hor/fdi:010093865},
  abstract  = {Due to the rapid evolution of high-throughput technologies, a tremendous amount of data is being produced in the biological domain, which poses a challenging task for information extraction and natural language understanding. Biological named entity recognition (NER) and named entity normalisation (NEN) are two common tasks aiming at identifying and linking biologically important entities such as genes or gene products mentioned in the literature to biological databases. In this paper, we present an updated version of OryzaGP, a gene and protein dataset for rice species created to help natural language processing (NLP) tools in processing NER and NEN tasks. To create the dataset, we selected more than 15,000 abstracts associated with articles previously curated for rice genes. We developed four dictionaries of gene and protein names associated with database identifiers. We used these dictionaries to annotate the dataset. We also annotated the dataset using pre-trained NLP models. Finally, we analysed the annotation results and discussed how to improve OryzaGP.},
}