@phdthesis{fdi:010093221,
  title    = {{Linking complementary datasets through the augmentation of knowledge graphs and multimodal representations}},
  author   = {Happi Happi, Bill Gates},
  language = {ENG},
  abstract = {At the end of the 20th century, the rise of the Internet enabled the creation of the web, a network of interconnected machines that exchange data in the form of documents. These documents allow humans to communicate and preserve information across generations. In the early 2000s, the concept of the Semantic Web emerged to enable machines to better understand and process these data. Models such as RDF (Resource Description Framework) were developed to represent information in the form of triples: subject, predicate, and object. With the explosion of data published on the web, several challenges have arisen, particularly regarding the management of descriptions of the same entity coming from various sources. The World Wide Web Consortium (W3C) formalized knowledge graphs, networks of annotated nodes and links, to structure and interlink these data. This thesis aims to improve the linking of RDF graphs in order to identify similar entities or instances that refer to the same real-world entity, relying on the owl:sameAs predicate. Entity (or instance) alignment, a rapidly growing field in the scientific community, seeks to address challenges related to data diversity, including linguistic and semantic variations. The goal is to integrate differently structured data and make them interoperable. Although several tools exist, they remain limited by challenges at multiple levels, such as efficiently reducing the number of entity pairs to compare and analyzing literal values. Linguistic and contextual differences add a further layer of complexity, requiring techniques capable of handling these variations. The field still offers opportunities for more sophisticated solutions that incorporate machine learning and semantic analysis techniques. In this thesis, we propose several contributions, ranging from specialized methods for simple datasets to the design of a general entity alignment model. Aware of the limitations of knowledge graphs, we propose an augmentation approach that starts with named entity recognition in literals. We developed GRU-SCANET, a new architecture that improves accuracy while reducing word-vector pre-training time; it outperforms the state of the art on eight biological datasets. We then evaluated the approach with spaCy because of its broad entity detection capabilities. In addition, we designed DLinker, which reduces the space of candidate entity pairs to compare. During the OAEI 2022 and PFIA/AFIA 2023 competitions, DLinker demonstrated its efficiency by reducing processing time to 1.6 seconds. For more detailed graphs, we developed GLinker, which applies graph embedding techniques to improve performance. Combined with a new similarity measure called HPP, GLinker outperformed the Jaro-Winkler method. Despite these improvements, limitations remain in handling multilingual data and synonyms. To address them, we conducted a comparative study of classifiers driven by different embedding techniques and propose a more general alignment model, LLM4EA. This model leverages language models such as GPT-2 and BERT to improve entity alignment and overcome linguistic and contextual challenges. In conclusion, this thesis presents various solutions for entity alignment, paving the way for future research.},
  address  = {Montpellier (FRA) ; Marseille},
  school   = {Universit{\'e} de Montpellier ; IRD},
  pages    = {211 multigr.},
  year     = {2024},
  url      = {https://www.documentation.ird.fr/hor/fdi:010093221},
}