% Journal article: Hassanaly, Perry, Mueller, and Yellapantula (2023), Data-Centric Engineering, vol. 4.
% Citation key is the exporter-generated identifier; keep it stable so existing \cite commands resolve.
% Author names use the unambiguous "Last, First" form without braces so styles can abbreviate given names.
@article{371e0b1fa7994eb2963e133b5e993994,
  author    = {Hassanaly, Malik and Perry, Bruce A. and Mueller, Michael E. and Yellapantula, Shashank},
  title     = {Uniform-in-phase-space data selection with iterative normalizing flows},
  journal   = {Data-Centric Engineering},
  year      = {2023},
  month     = apr,
  day       = {25},
  volume    = {4},
  number    = {7},
  publisher = {Cambridge University Press},
  issn      = {2632-6736},
  doi       = {10.1017/dce.2023.4},
  language  = {English (US)},
  keywords  = {Data reduction, instance selection, normalizing flows},
  abstract  = {Improvements in computational and experimental capabilities are rapidly increasing the amount of scientific data that are routinely generated. In applications that are constrained by memory and computational intensity, excessively large datasets may hinder scientific discovery, making data reduction a critical component of data-driven methods. Datasets are growing in two directions: the number of data points and their dimensionality. Whereas dimension reduction typically aims at describing each data sample on lower-dimensional space, the focus here is on reducing the number of data points. A strategy is proposed to select data points such that they uniformly span the phase-space of the data. The algorithm proposed relies on estimating the probability map of the data and using it to construct an acceptance probability. An iterative method is used to accurately estimate the probability of the rare data points when only a small subset of the dataset is used to construct the probability map. Instead of binning the phase-space to estimate the probability map, its functional form is approximated with a normalizing flow. Therefore, the method naturally extends to high-dimensional datasets. The proposed framework is demonstrated as a viable pathway to enable data-efficient machine learning when abundant data are available.},
  note      = {Funding Information: This work was authored by the National Renewable Energy Laboratory (NREL), operated by Alliance for Sustainable Energy, LLC, for the U.S. Department of Energy (DOE) under Contract No. DE-AC36-08GO28308. This work was supported by the U.S. Department of Energy Office of Energy Efficiency and Renewable Energy Vehicle Technologies Office (VTO). The research was performed using computational resources sponsored by the Department of Energy{\textquoteright}s Office of Energy Efficiency and Renewable Energy and located at the National Renewable Energy Laboratory. The views expressed in the article do not necessarily represent the views of the DOE or the U.S. Government. The U.S. Government retains and the publisher, by accepting the article for publication, acknowledges that the U.S. Government retains a nonexclusive, paid-up, irrevocable, worldwide license to publish or reproduce the published form of this work, or allow others to do so, for U.S. Government purposes. Funding Information: An implementation of the method along with tutorials is available in a companion repository (https://github.com/NREL/Phase-space-sampling) under BSD-3 license. A subsample of the 2D combustion dataset is also available in the repository. Data were obtained from Buildings Data Hub funded by U.S. Department of Energy, Office of Energy Efficiency and Renewable Energy{\textquoteright}s Building Technologies Office operated and maintained by Pacific Northwest National Laboratory at https://bbd.labworks.org. Publisher Copyright: {\textcopyright} Alliance for Sustainable Energy, LLC, 2023. Published by Cambridge University Press.},
}