2023.bib

@inproceedings{Li2021NeuralFourier,
  author = {Z. Li and N. B. Kovachki and K. Azizzadenesheli and B. Liu and K. Bhattacharya and A. Stuart and A. Anandkumar},
  booktitle = {International Conference on Learning Representations},
  title = {Fourier Neural Operator for Parametric Partial Differential Equations},
  year = {2021},
  abstract = {The classical development of neural networks has primarily focused on learning mappings between finite-dimensional Euclidean spaces.  Recently, this has been generalized to neural operators that learn mappings between function spaces. For partial differential equations (PDEs), neural operators directly learn the mapping from any functional parametric dependence to the solution. Thus, they learn an entire family of PDEs, in contrast to classical methods which solve one instance of the equation. In this work, we formulate a new neural operator by parameterizing the integral kernel directly in Fourier space, allowing for an expressive and efficient architecture. We perform experiments on Burgers' equation, Darcy flow, and Navier-Stokes equation. The Fourier neural operator is the first ML-based method to successfully model turbulent flows with zero-shot super-resolution. It is up to three orders of magnitude faster compared to traditional PDE solvers. Additionally, it achieves superior accuracy compared to previous learning-based solvers under fixed resolution.},
  bdsk-url-1 = {https://openreview.net/forum?id=c8P9NQVtmnO},
  date-added = {2022-02-25 14:50:38 +0100},
  date-modified = {2022-02-25 14:51:12 +0100},
  url = {https://openreview.net/forum?id=c8P9NQVtmnO}
}
@article{Kovachki2021,
  author = {Kovachki, Nikola and Lanthaler, Samuel and Mishra, Siddhartha},
  title = {On universal approximation and error bounds for Fourier Neural Operators},
  year = {2021},
  abstract = {Fourier neural operators (FNOs) have recently been proposed as an effective framework for learning operators that map between infinite-dimensional spaces. We prove that FNOs are universal, in the sense that they can approximate any continuous operator to desired accuracy. Moreover, we suggest a mechanism by which FNOs can approximate operators associated with PDEs efficiently. Explicit error bounds are derived to show that the size of the FNO, approximating operators associated with a Darcy type elliptic PDE and with the incompressible Navier-Stokes equations of fluid dynamics, only increases sub (log)-linearly in terms of the reciprocal of the error. Thus, FNOs are shown to efficiently approximate operators arising in a large class of PDEs.},
  bdsk-url-1 = {https://arxiv.org/abs/2107.07562},
  bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2107.07562},
  copyright = {Creative Commons Attribution 4.0 International},
  date-added = {2022-08-13 19:24:05 +0200},
  date-modified = {2022-08-13 19:24:16 +0200},
  doi = {10.48550/ARXIV.2107.07562},
  keywords = {Numerical Analysis (math.NA), FOS: Mathematics},
  publisher = {arXiv},
  url = {https://arxiv.org/abs/2107.07562}
}
@misc{Lam2022,
  author = {Lam, Remi and Sanchez-Gonzalez, Alvaro and Willson, Matthew and Wirnsberger, Peter and Fortunato, Meire and Pritzel, Alexander and Ravuri, Suman and Ewalds, Timo and Alet, Ferran and Eaton-Rosen, Zach and Hu, Weihua and Merose, Alexander and Hoyer, Stephan and Holland, George and Stott, Jacklynn and Vinyals, Oriol and Mohamed, Shakir and Battaglia, Peter},
  title = {GraphCast: Learning skillful medium-range global weather forecasting},
  year = {2022},
  abstract = {Global medium-range weather forecasting is critical to decision-making across many social and economic domains. Traditional numerical weather prediction uses increased compute resources to improve forecast accuracy, but cannot directly use historical weather data to improve the underlying model. We introduce a machine learning-based method called "GraphCast", which can be trained directly from reanalysis data. It predicts hundreds of weather variables, over 10 days at 0.25 degree resolution globally, in under one minute. We show that GraphCast significantly outperforms the most accurate operational deterministic systems on 90% of 1380 verification targets, and its forecasts support better severe event prediction, including tropical cyclones, atmospheric rivers, and extreme temperatures. GraphCast is a key advance in accurate and efficient weather forecasting, and helps realize the promise of machine learning for modeling complex dynamical systems.},
  bdsk-url-1 = {https://arxiv.org/abs/2212.12794},
  bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2212.12794},
  copyright = {Creative Commons Attribution 4.0 International},
  date-added = {2023-01-02 11:17:42 +0100},
  date-modified = {2023-01-02 11:17:48 +0100},
  doi = {10.48550/ARXIV.2212.12794},
  keywords = {Machine Learning (cs.LG), Atmospheric and Oceanic Physics (physics.ao-ph), FOS: Computer and information sciences, FOS: Physical sciences},
  publisher = {arXiv},
  url = {https://arxiv.org/abs/2212.12794}
}
@misc{li2022superresolution,
  author = {Yang Li and Haiyu Dong and Zuliang Fang and Jonathan Weyn and Pete Luferenko},
  title = {Super-resolution Probabilistic Rain Prediction from Satellite Data Using 3D U-Nets and EarthFormers},
  year = {2022},
  abstract = {Accurate and timely rain prediction is crucial for decision-making and is also a challenging task. This paper presents a solution which won the 2nd prize in the Weather4cast 2022 NeurIPS competition using 3D U-Nets and EarthFormers for 8-hour probabilistic rain prediction based on multi-band satellite images. The spatial context effect of the input satellite image has been deeply explored and an optimal context range has been found. Based on the imbalanced rain distribution, we trained multiple models with different loss functions. To further improve the model performance, multi-model ensemble and threshold optimization were used to produce the final probabilistic rain prediction. Experiment results and leaderboard scores demonstrate that optimal spatial context, combined loss function, multi-model ensemble, and threshold optimization all provide modest model gain. A permutation test was used to analyze the effect of each satellite band on rain prediction, and results show that satellite bands signifying cloud-top phase (8.7 um) and cloud-top height (10.8 and 13.4 um) are the best predictors for rain prediction. The source code is available at this https URL.},
  archiveprefix = {arXiv},
  date-added = {2023-08-01 20:29:57 +0200},
  date-modified = {2023-08-01 20:29:57 +0200},
  eprint = {2212.02998},
  primaryclass = {cs.CV}
}
@article{Bi:2023aa,
  abstract = {Weather forecasting is important for science and society. At present, the most accurate forecast system is the numerical weather prediction (NWP) method, which represents atmospheric states as discretized grids and numerically solves partial differential equations that describe the transition between those states. However, this procedure is computationally expensive. Recently, artificial-intelligence-based methods have shown potential in accelerating weather forecasting by orders of magnitude, but the forecast accuracy is still significantly lower than that of NWP methods. Here we introduce an artificial-intelligence-based method for accurate, medium-range global weather forecasting. We show that three-dimensional deep networks equipped with Earth-specific priors are effective at dealing with complex patterns in weather data, and that a hierarchical temporal aggregation strategy reduces accumulation errors in medium-range forecasting. Trained on 39 years of global data, our program, Pangu-Weather, obtains stronger deterministic forecast results on reanalysis data in all tested variables when compared with the world's best NWP system, the operational integrated forecasting system of the European Centre for Medium-Range Weather Forecasts (ECMWF). Our method also works well with extreme weather forecasts and ensemble forecasts. When initialized with reanalysis data, the accuracy of tracking tropical cyclones is also higher than that of ECMWF-HRES.},
  author = {Bi, Kaifeng and Xie, Lingxi and Zhang, Hengheng and Chen, Xin and Gu, Xiaotao and Tian, Qi},
  date = {2023/07/05},
  date-added = {2023-07-13 11:08:09 +0200},
  date-modified = {2023-07-13 11:08:09 +0200},
  doi = {10.1038/s41586-023-06185-3},
  id = {Bi2023},
  issn = {1476-4687},
  journal = {Nature},
  title = {Accurate medium-range global weather forecasting with 3D neural networks},
  url = {https://doi.org/10.1038/s41586-023-06185-3},
  year = {2023},
  bdsk-url-1 = {https://doi.org/10.1038/s41586-023-06185-3}
}
@article{Lin2022,
  abstract = {Artificial intelligence has the potential to open insight into the structure of proteins at the scale of evolution. It has only recently been possible to extend protein structure prediction to two hundred million cataloged proteins. Characterizing the structures of the exponentially growing billions of protein sequences revealed by large scale gene sequencing experiments would necessitate a breakthrough in the speed of folding. Here we show that direct inference of structure from primary sequence using a large language model enables an order of magnitude speed-up in high resolution structure prediction. Leveraging the insight that language models learn evolutionary patterns across millions of sequences, we train models up to 15B parameters, the largest language model of proteins to date. As the language models are scaled they learn information that enables prediction of the three-dimensional structure of a protein at the resolution of individual atoms. This results in prediction that is up to 60x faster than state-of-the-art while maintaining resolution and accuracy. Building on this, we present the ESM Metagenomic Atlas. This is the first large-scale structural characterization of metagenomic proteins, with more than 617 million structures. The atlas reveals more than 225 million high confidence predictions, including millions whose structures are novel in comparison with experimentally determined structures, giving an unprecedented view into the vast breadth and diversity of the structures of some of the least understood proteins on earth.},
  author = {Lin, Zeming and Akin, Halil and Rao, Roshan and Hie, Brian and Zhu, Zhongkai and Lu, Wenting and Smetanin, Nikita and Verkuil, Robert and Kabeli, Ori and Shmueli, Yaniv and dos Santos Costa, Allan and Fazel-Zarandi, Maryam and Sercu, Tom and Candido, Salvatore and Rives, Alexander},
  date-added = {2022-11-11 09:42:27 +0100},
  date-modified = {2022-11-11 09:43:09 +0100},
  doi = {10.1101/2022.07.20.500902},
  elocation-id = {2022.07.20.500902},
  eprint = {https://www.biorxiv.org/content/early/2022/10/31/2022.07.20.500902.full.pdf},
  journal = {bioRxiv},
  publisher = {Cold Spring Harbor Laboratory},
  title = {Evolutionary-scale prediction of atomic level protein structure with a language model},
  url = {https://www.biorxiv.org/content/early/2022/10/31/2022.07.20.500902},
  year = {2022},
  bdsk-url-1 = {https://www.biorxiv.org/content/early/2022/10/31/2022.07.20.500902},
  bdsk-url-2 = {https://doi.org/10.1101/2022.07.20.500902}
}
@misc{mardani2023generative,
  author = {Morteza Mardani and Noah Brenowitz and Yair Cohen and Jaideep Pathak and Chieh-Yu Chen and Cheng-Chin Liu and Arash Vahdat and Karthik Kashinath and Jan Kautz and Mike Pritchard},
  title = {Generative Residual Diffusion Modeling for Km-scale Atmospheric Downscaling},
  year = {2023},
  abstract = {The state of the art for physical hazard prediction from weather and climate requires expensive km-scale numerical simulations driven by coarser resolution global inputs. Here, a km-scale downscaling diffusion model is presented as a cost-effective alternative. The model is trained from a regional high-resolution weather model over Taiwan, and conditioned on ERA5 reanalysis data. To address the downscaling uncertainties, namely the large resolution ratio (25km to 2km), the different physics involved at different scales, and the prediction of channels that are not in the input data, we employ a two-step approach (\textit{ResDiff}) where a (UNet) regression predicts the mean in the first step and a diffusion model predicts the residual in the second step. \textit{ResDiff} exhibits encouraging skill in bulk RMSE and CRPS scores. The predicted spectra and distributions from ResDiff faithfully recover important power law relationships regulating damaging wind and rain extremes. Case studies of coherent weather phenomena reveal appropriate multivariate relationships reminiscent of learnt physics. This includes the sharp wind and temperature variations that co-locate with intense rainfall in a cold front, and the extreme winds and rainfall bands that surround the eyewall of typhoons. Some evidence of simultaneous bias correction is found. A first attempt at downscaling directly from an operational global forecast model successfully retains many of these benefits. The implication is that a new era of fully end-to-end, global-to-regional machine learning weather prediction is likely near at hand.},
  archiveprefix = {arXiv},
  eprint = {2309.15214},
  primaryclass = {cs.LG}
}
@misc{Hutchinson2020,
  author = {Hutchinson, Michael and Lan, Charline Le and Zaidi, Sheheryar and Dupont, Emilien and Teh, Yee Whye and Kim, Hyunjik},
  title = {LieTransformer: Equivariant self-attention for Lie Groups},
  year = {2020},
  abstract = {Group equivariant neural networks are used as building blocks of group invariant neural networks, which have been shown to improve generalisation performance and data efficiency through principled parameter sharing. Such works have mostly focused on group equivariant convolutions, building on the result that group equivariant linear maps are necessarily convolutions. In this work, we extend the scope of the literature to self-attention, which is emerging as a prominent building block of deep learning models. We propose the LieTransformer, an architecture composed of LieSelfAttention layers that are equivariant to arbitrary Lie groups and their discrete subgroups. We demonstrate the generality of our approach by showing experimental results that are competitive to baseline methods on a wide range of tasks: shape counting on point clouds, molecular property regression and modelling particle trajectories under Hamiltonian dynamics.},
  bdsk-url-1 = {https://arxiv.org/abs/2012.10885},
  bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2012.10885},
  copyright = {arXiv.org perpetual, non-exclusive license},
  date-added = {2022-08-13 09:57:27 +0200},
  date-modified = {2022-08-13 09:57:34 +0200},
  doi = {10.48550/ARXIV.2012.10885},
  keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  publisher = {arXiv},
  url = {https://arxiv.org/abs/2012.10885}
}
@article{DeVore2021,
  author = {Ronald DeVore and Boris Hanin and Guergana Petrova},
  journal = {Acta Numerica},
  title = {Neural network approximation},
  year = {2021},
  month = {may},
  pages = {327--444},
  volume = {30},
  abstract = {Neural networks (NNs) are the method of choice for building learning algorithms. They are now being investigated for other numerical tasks such as solving high-dimensional partial differential equations. Their popularity stems from their empirical success on several challenging learning problems (computer chess/Go, autonomous navigation, face recognition). However, most scholars agree that a convincing theoretical explanation for this success is still lacking. Since these applications revolve around approximating an unknown function from data observations, part of the answer must involve the ability of NNs to produce accurate approximations.

This article surveys the known approximation properties of the outputs of NNs with the aim of uncovering the properties that are not present in the more traditional methods of approximation used in numerical analysis, such as approximations using polynomials, wavelets, rational functions and splines. Comparisons are made with traditional approximation methods from the viewpoint of rate distortion, i.e. error versus the number of parameters used to create the approximant. Another major component in the analysis of numerical approximation is the computational time needed to construct the approximation, and this in turn is intimately connected with the stability of the approximation algorithm. So the stability of numerical approximation using NNs is a large part of the analysis put forward.

The survey, for the most part, is concerned with NNs using the popular ReLU activation function. In this case the outputs of the NNs are piecewise linear functions on rather complicated partitions of the domain of f into cells that are convex polytopes. When the architecture of the NN is fixed and the parameters are allowed to vary, the set of output functions of the NN is a parametrized nonlinear manifold. It is shown that this manifold has certain space-filling properties leading to an increased ability to approximate (better rate distortion) but at the expense of numerical stability. The space filling creates the challenge to the numerical method of finding best or good parameter choices when trying to approximate.},
  doi = {10.1017/s0962492921000052},
  publisher = {Cambridge University Press ({CUP})}
}
@article{Scaman2018,
  author = {Scaman, Kevin and Virmaux, Aladin},
  title = {Lipschitz regularity of deep neural networks: analysis and efficient estimation},
  year = {2018},
  month = may,
  abstract = {Deep neural networks are notorious for being sensitive to small well-chosen perturbations, and estimating the regularity of such architectures is of utmost importance for safe and robust practical applications. In this paper, we investigate one of the key characteristics to assess the regularity of such methods: the Lipschitz constant of deep learning architectures. First, we show that, even for two-layer neural networks, the exact computation of this quantity is NP-hard and state-of-the-art methods may significantly overestimate it. Then, we both extend and improve previous estimation methods by providing AutoLip, the first generic algorithm for upper bounding the Lipschitz constant of any automatically differentiable function. We provide a power method algorithm working with automatic differentiation, allowing efficient computations even on large convolutions. Second, for sequential neural networks, we propose an improved algorithm named SeqLip that takes advantage of the linear computation graph to split the computation per pair of consecutive layers. Third, we propose heuristics on SeqLip in order to tackle very large networks. Our experiments show that SeqLip can significantly improve on the existing upper bounds. Finally, we provide an implementation of AutoLip in the PyTorch environment that may be used to better estimate the robustness of a given neural network to small perturbations or regularize it using more precise Lipschitz estimations.},
  archiveprefix = {arXiv},
  copyright = {arXiv.org perpetual, non-exclusive license},
  doi = {10.48550/ARXIV.1805.10965},
  eprint = {1805.10965},
  file = {:http\://arxiv.org/pdf/1805.10965v2:PDF},
  keywords = {Machine Learning (stat.ML), Machine Learning (cs.LG), FOS: Computer and information sciences},
  primaryclass = {stat.ML},
  publisher = {arXiv}
}
@article{Tanyu2023,
  author = {Derick Nganyu Tanyu and Jianfeng Ning and Tom Freudenberg and Nick Heilenkötter and Andreas Rademacher and Uwe Iben and Peter Maass},
  journal = {Inverse Problems},
  title = {Deep learning methods for partial differential equations and related parameter identification problems},
  year = {2023},
  month = {aug},
  number = {10},
  pages = {103001},
  volume = {39},
  abstract = {Recent years have witnessed a growth in mathematics for deep learning—which seeks a deeper understanding of the concepts of deep learning with mathematics and explores how to make it more robust—and deep learning for mathematics, where deep learning algorithms are used to solve problems in mathematics. The latter has popularised the field of scientific machine learning where deep learning is applied to problems in scientific computing. Specifically, more and more neural network (NN) architectures have been developed to solve specific classes of partial differential equations (PDEs). Such methods exploit properties that are inherent to PDEs and thus solve the PDEs better than standard feed-forward NNs, recurrent NNs, or convolutional neural networks. This has had a great impact in the area of mathematical modelling where parametric PDEs are widely used to model most natural and physical processes arising in science and engineering. In this work, we review such methods as well as their extensions for parametric studies and for solving the related inverse problems. We also show their relevance in various industrial applications.},
  doi = {10.1088/1361-6420/ace9d4},
  publisher = {{IOP} Publishing}
}
@article{Mueller2021,
  author = {Müller, Johannes and Zeinhofer, Marius},
  title = {Error Estimates for the Deep Ritz Method with Boundary Penalty},
  year = {2021},
  month = mar,
  abstract = {We estimate the error of the Deep Ritz Method for linear elliptic equations. For Dirichlet boundary conditions, we estimate the error when the boundary values are imposed through the boundary penalty method. Our results apply to arbitrary sets of ansatz functions and estimate the error in dependence of the optimization accuracy, the approximation capabilities of the ansatz class and -- in the case of Dirichlet boundary values -- the penalization strength $\lambda$. To the best of our knowledge, our results are presently the only ones in the literature that treat the case of Dirichlet boundary conditions in full generality, i.e., without a lower order term that leads to coercivity on all of $H^1(\Omega)$. Further, we discuss the implications of our results for ansatz classes which are given through ReLU networks and the relation to existing estimates for finite element functions. For high dimensional problems our results show that the favourable approximation capabilities of neural networks for smooth functions are inherited by the Deep Ritz Method.},
  archiveprefix = {arXiv},
  copyright = {arXiv.org perpetual, non-exclusive license},
  doi = {10.48550/ARXIV.2103.01007},
  eprint = {2103.01007},
  file = {:http\://arxiv.org/pdf/2103.01007v4:PDF},
  keywords = {Numerical Analysis (math.NA), Machine Learning (cs.LG), FOS: Mathematics, FOS: Computer and information sciences, 65N15, 68T07},
  primaryclass = {math.NA},
  publisher = {arXiv}
}
@article{Barron1993,
  author = {A.R. Barron},
  journal = {IEEE Transactions on Information Theory},
  title = {Universal Approximation Bounds for Superpositions of a Sigmoidal Function},
  year = {1993},
  number = {3},
  pages = {930--945},
  volume = {39},
  abstract = {Approximation properties of a class of artificial neural networks are established. It is shown that feedforward networks with one layer of sigmoidal nonlinearities achieve integrated squared error of order $O(1/n)$, where n is the number of nodes. The function approximated is assumed to have a bound on the first moment of the magnitude distribution of the Fourier transform. The nonlinear parameters associated with the sigmoidal nodes, as well as the parameters of linear combination, are adjusted in the approximation. In contrast, it is shown that for series expansions with n terms, in which only the parameters of linear combination are adjusted, the integrated squared approximation error cannot be made smaller than order $(1/n)^{2/d}$ uniformly for functions satisfying the same smoothness assumption, where d is the dimension of the input to the function. For the class of functions examined here, the approximation rate and the parsimony of the parameterization of the networks are surprisingly advantageous in high-dimensional settings.}
}
@article{Kapustsin2023,
  author = {V. Kapustsin and U. Kaya and T. Richter},
  journal = {submitted},
  title = {Error analysis for hybrid finite element / neural network discretizations},
  year = {2023},
  abstract = {We describe and analyze a hybrid finite element / neural network method for predicting solutions of partial differential equations. The methodology is designed for obtaining fine scale fluctuations from neural networks in a local manner. The network is capable of locally correcting a coarse finite element solution towards a fine solution, taking the source term and the coarse approximation as input. A key observation is the dependency between the quality of predictions and the size of the training set, which consists of different source terms and corresponding fine and coarse solutions. We provide an a priori error analysis of the method together with a stability analysis of the neural network. The numerical experiments confirm the capability of the network to predict fine finite element solutions. We also illustrate the generalization of the method to problems where test and training domains differ from each other.}
}
@comment{{jabref-meta: databaseType:bibtex;}}
