% McPhee et al. (2024), journal article in Socio-Environmental Systems Modelling, vol. 6.
% NOTE(review): `pages` matches the DOI suffix, so it appears to be an article
% identifier rather than a page range -- confirm against the publisher record.
@article{McPhee_Richetti_Croke_Walmsley_2024,
  author   = {McPhee, Malcolm and Richetti, Jonathan and Croke, Barry and Walmsley, Brad},
  title    = {Model evaluation: The misuse of statistical techniques when evaluating observations versus predictions},
  journal  = {Socio-Environmental Systems Modelling},
  volume   = {6},
  pages    = {18758},
  year     = {2024},
  month    = sep,
  doi      = {10.18174/sesmo.18758},
  url      = {https://sesmo.org/article/view/18758},
  abstract = {Mathematical modellers, decision support developers, statisticians,
    and students evaluate the differences between observed and model predicted
    values. When evaluating models, it is far too easy to conduct model
    evaluation by fitting a linear regression to the data. In this paper, steps
    are presented on `how to' evaluate a model using deviance metrics rather
    than reporting $r^2$ from fitting a linear regression. The paper aims to
    provide sound reasoning, with data, against using $r^2$. The paper addresses
    five arguments, previously put forward, for not fitting a linear regression
    when conducting model evaluation: i) Misapplication of regression;
    ii) Ambiguity of null hypothesis tests; iii) Lack of sensitivity;
    iv) Fitted line is irrelevant to validation; and v) Violation of regression
    assumptions. Statistical, deviance, and quality control metrics are
    outlined. Three models using the BeefSpecs drafting tool are reported in
    this paper. Each model (n = 80) had an $r^2$ of 0.43. A mean bias of 0.06,
    -2.90, and -0.11 mm, and a root mean square error of prediction (RMSEP) of
    1.72, 3.37, and 3.70 mm for models 1, 2, and 3, respectively. A modelling
    efficiency (MEF) of 0.39, -1.34, and -1.83, and 91, 51, and 56\% of
    predictions within upper and lower quality control limits for models 1, 2,
    and 3, respectively. These metrics highlight the pitfall of reporting $r^2$
    from using regression. Minimum recommended steps of `how to' conduct model
    evaluation are: a plot of the residuals with quality control limits and a
    table of metrics including mean observed, predicted and bias, RMSEP, and
    MEF.},
}