% Journal article (Genomics and Informatics 19(3), article e27, 4 pages); record URL points to documentation.ird.fr.
@article{fdi:010093865,
  author   = {Larmande, Pierre and Liu, Y. and Yao, X. and Xia, J.},
  title    = {{OryzaGP} 2021 update: a rice gene and protein dataset for named-entity recognition},
  journal  = {Genomics and Informatics},
  volume   = {19},
  number   = {3},
  pages    = {e27},
  numpages = {4},
  year     = {2021},
  issn     = {2234-0742},
  doi      = {10.5808/gi.21015},
  url      = {https://www.documentation.ird.fr/hor/fdi:010093865},
  language = {english},
  abstract = {Due to the rapid evolution of high-throughput technologies, a tremendous amount of data is being produced in the biological domain, which poses a challenging task for information extraction and natural language understanding. Biological named entity recognition (NER) and named entity normalisation (NEN) are two common tasks aiming at identifying and linking biologically important entities such as genes or gene products mentioned in the literature to biological databases. In this paper, we present an updated version of OryzaGP, a gene and protein dataset for rice species created to help natural language processing (NLP) tools in processing NER and NEN tasks. To create the dataset, we selected more than 15,000 abstracts associated with articles previously curated for rice genes. We developed four dictionaries of gene and protein names associated with database identifiers. We used these dictionaries to annotate the dataset. We also annotated the dataset using pre-trained NLP models. Finally, we analysed the annotation results and discussed how to improve OryzaGP.},
}