% Journal article record for the OryzaGP rice gene/protein NER dataset paper.
% The citation key is the same "fdi:" identifier that appears in the url field.
% Names use the unambiguous "Last, First" form; only OryzaGP is brace-protected
% in the title so that case-folding styles keep its capitalisation.
% The pages field holds the article number; the source record's raw pages
% value is preserved verbatim in the ignored internal-note field (the bracketed
% "3" is presumably a page count -- TODO confirm against the publisher record).
@article{fdi:010084042,
  author        = {Larmande, Pierre and Do, Huy and Wang, Yue},
  title         = {{OryzaGP}: rice gene and protein dataset for named-entity recognition},
  journal       = {Genomics and Informatics},
  volume        = {17},
  number        = {2},
  pages         = {e17},
  year          = {2019},
  issn          = {2234-0742},
  doi           = {10.5808/GI.2019.17.2.e17},
  url           = {https://www.documentation.ird.fr/hor/fdi:010084042},
  language      = {ENG},
  keywords      = {RIZ ; CONSERVATION DES RESSOURCES GENETIQUES ; BIOLOGIE MOLECULAIRE ; LANGAGE DE PROGRAMMATION ; FOUILLE DE TEXTE},
  abstract      = {Text mining has become an important research method in biology, with its original purpose to extract biological entities, such as genes, proteins and phenotypic traits, to extend knowledge from scientific papers. However, few thorough studies on text mining and application development, for plant molecular biology data, have been performed, especially for rice, resulting in a lack of datasets available to solve named-entity recognition tasks for this species. Since there are rare benchmarks available for rice, we faced various difficulties in exploiting advanced machine learning methods for accurate analysis of the rice literature. To evaluate several approaches to automatically extract information from gene/protein entities, we built a new dataset for rice as a benchmark. This dataset is composed of a set of titles and abstracts, extracted from scientific papers focusing on the rice species, and is downloaded from PubMed. During the 5th Biomedical Linked Annotation Hackathon, a portion of the dataset was uploaded to PubAnnotation for sharing. Our ultimate goal is to offer a shared task of rice gene/protein name recognition through the BioNLP Open Shared Tasks framework using the dataset, to facilitate an open comparison and evaluation of different approaches to the task.},
  internal-note = {pages value in the source record: e17 [3 ]},
}