% Journal article record; the citation key matches the record identifier at the end of `url`.
% Only acronyms are brace-protected; LaTeX specials in the abstract (\_, \%) are escaped.
@article{fdi:010095450,
  author   = {Ternikar, C. R. and Gomez, C{\'e}cile and Dutta, D. and Kumar, D. N.},
  title    = {Nearest neighbor versus regression approach: effect of performance measures, calibration set size, and sampling method on soil organic carbon prediction using {VNIR} lab spectroscopy},
  journal  = {{IEEE} Journal of Selected Topics in Applied Earth Observations and Remote Sensing},
  volume   = {18},
  pages    = {25583--25604},
  year     = {2025},
  issn     = {1939-1404},
  doi      = {10.1109/jstars.2025.3615516},
  url      = {https://www.documentation.ird.fr/hor/fdi:010095450},
  language = {english},
  keywords = {Soil measurements ; Calibration ; Analytical models ; Predictive models ; Spectroscopy ; Computational modeling ; Biological system modeling ; Data models ; Organic materials ; Climate change ; Infrared imaging ; Error correction ; Nearest neighbor methods ; Least squares approximations ; Error correlation analysis ; nearest neighbor (NN) models ; open soil ; spectral library ; partial least squares regression (PLSR) ; soil organic carbon (SOC) ; structural independence ; visible and near-infrared (VNIR) ; lab spectroscopy},
  abstract = {Soil organic carbon (SOC) plays a critical role in soil health, agricultural productivity, and ecosystem functioning, making accurate SOC estimations essential for sustainable land management and climate change mitigation.
    Visible and near-infrared spectroscopy has emerged as a promising, nondestructive, and cost-effective method for SOC estimation.
    This study evaluates the performance of nine nearest neighbor (NN) models and the partial least squares regression (PLSR) model to estimate SOC using the global open soil spectral library data.
    Detailed error analyses and the use of mean absolute error (MAE) as performance metric revealed differences in model performance that traditional metrics like $R^2$, RMSE, and ratio of performance to deviation alone fail to capture.
    Error correlation analysis further indicated that o\_plsd (optimized partial least squares distance, one of the NN models) and PLSR provide structurally independent insights, while certain pairs of NN models (pcad - plsd and o\_plsd - o\_pcad) yield redundant information.
    Among the ten models tested, o\_plsd model outperformed PLSR by leveraging local data density, exhibiting lower MAE (1.79\% versus 2.36\%) but was more sensitive to reduction in calibration set size.
    In contrast, PLSR demonstrated better generalizability with less sensitivity to calibration size variation, but relatively higher sensitivity to the choice of sampling method.
    Future research should focus on strategies to improve computational efficiency of NN models.
    The findings highlight the importance of performance metric selection and calibration strategy in large-scale SOC modeling.
    These results have practical implications for improving SOC prediction models and designing efficient hybrid approaches for large, heterogeneous soil datasets.},
}