
@article{Fekete2023,
  author = {Fekete, Imre and Molnár, András and Simon, Péter L.},
  journal = {Results in Mathematics},
  title = {A Functional Approach to Interpreting the Role of the Adjoint Equation in Machine Learning},
  year = {2023},
  issn = {1420-9012},
  month = dec,
  note = {Seminar 2024/2025},
  number = {1},
  volume = {79},
  abstract = {The connection between numerical methods for solving differential equations and machine learning has been revealed recently. Differential equations have been proposed as continuous analogues of deep neural networks, and then used in handling certain tasks, such as image recognition, where the training of a model includes learning the parameters of systems of ODEs from certain points along their trajectories. Treating this inverse problem of determining the parameters of a dynamical system that minimize the difference between data and trajectory by a gradient-based optimization method presents the solution of the adjoint equation as the continuous analogue of backpropagation that yields the appropriate gradients. The paper explores an abstract approach that can be used to construct a family of loss functions with the aim of fitting the solution of an initial value problem to a set of discrete or continuous measurements. It is shown that an extension of the adjoint equation can be used to derive the gradient of the loss function as a continuous analogue of backpropagation in machine learning. Numerical evidence is presented that under reasonably controlled circumstances the gradients obtained this way can be used in a gradient descent to fit the solution of an initial value problem to a set of continuous noisy measurements, and a set of discrete noisy measurements that are recorded at uncertain times.},
  doi = {10.1007/s00025-023-02074-3},
  publisher = {Springer Science and Business Media LLC}
}
@misc{Mueller2023,
  author = {Müller, Johannes and Zeinhofer, Marius},
  note = {Seminar 2024/2025},
  title = {Achieving High Accuracy with PINNs via Energy Natural Gradients},
  year = {2023},
  abstract = {We propose energy natural gradient descent, a natural gradient method with respect to a Hessian-induced Riemannian metric as an optimization algorithm for physics-informed neural networks (PINNs) and the deep Ritz method. As a main motivation we show that the update direction in function space resulting from the energy natural gradient corresponds to the Newton direction modulo an orthogonal projection onto the model's tangent space. We demonstrate experimentally that energy natural gradient descent yields highly accurate solutions with errors several orders of magnitude smaller than what is obtained when training PINNs with standard optimizers like gradient descent or Adam, even when those are allowed significantly more computation time.},
  copyright = {arXiv.org perpetual, non-exclusive license},
  doi = {10.48550/ARXIV.2302.13163},
  keywords = {Machine Learning (cs.LG), Numerical Analysis (math.NA), FOS: Computer and information sciences, FOS: Mathematics},
  publisher = {arXiv}
}
@misc{Haeusner2023,
  author = {Häusner, Paul and Öktem, Ozan and Sjölund, Jens},
  note = {Seminar 2024/2025},
  title = {Neural incomplete factorization: learning preconditioners for the conjugate gradient method},
  year = {2023},
  abstract = {Finding suitable preconditioners to accelerate iterative solution methods, such as the conjugate gradient method, is an active area of research. In this paper, we develop a computationally efficient data-driven approach to replace the typically hand-engineered algorithms with neural networks. Optimizing the condition number of the linear system directly is computationally infeasible. Instead, our method generates an incomplete factorization of the matrix and is, therefore, referred to as neural incomplete factorization (NeuralIF). For efficient training, we utilize a stochastic approximation of the Frobenius loss which only requires matrix-vector multiplications. At the core of our method is a novel message-passing block, inspired by sparse matrix theory, that aligns with the objective of finding a sparse factorization of the matrix. By replacing conventional preconditioners used within the conjugate gradient method by data-driven models based on graph neural networks, we accelerate the iterative solving procedure. We evaluate our proposed method on both a synthetic and a real-world problem arising from scientific computing and show its ability to reduce the solving time while remaining computationally efficient.},
  copyright = {arXiv.org perpetual, non-exclusive license},
  doi = {10.48550/ARXIV.2305.16368},
  keywords = {Optimization and Control (math.OC), Machine Learning (cs.LG), Numerical Analysis (math.NA), Machine Learning (stat.ML), FOS: Mathematics, FOS: Computer and information sciences},
  publisher = {arXiv}
}
@misc{Rackow2024_WeatherFuture,
  archiveprefix = {arXiv},
  author = {Thomas Rackow and Nikolay Koldunov and Christian Lessig and Irina Sandu and Mihai Alexe and Matthew Chantry and Mariana Clare and Jesper Dramsch and Florian Pappenberger and Xabier Pedruzo-Bagazgoitia and Steffen Tietsche and Thomas Jung},
  date-added = {2024-10-01 16:47:04 +0200},
  date-modified = {2024-10-01 16:47:15 +0200},
  eprint = {2409.18529},
  primaryclass = {physics.ao-ph},
  title = {Robustness of AI-based weather forecasts in a changing climate},
  url = {https://arxiv.org/abs/2409.18529},
  year = {2024},
  bdsk-url-1 = {https://arxiv.org/abs/2409.18529}
}
@misc{Bodnar2024,
  archiveprefix = {arXiv},
  author = {Cristian Bodnar and Wessel P. Bruinsma and Ana Lucic and Megan Stanley and Johannes Brandstetter and Patrick Garvan and Maik Riechert and Jonathan Weyn and Haiyu Dong and Anna Vaughan and Jayesh K. Gupta and Kit Tambiratnam and Alex Archibald and Elizabeth Heider and Max Welling and Richard E. Turner and Paris Perdikaris},
  date-added = {2024-06-19 12:29:19 +0200},
  date-modified = {2024-06-19 12:29:27 +0200},
  eprint = {2405.13063},
  primaryclass = {physics.ao-ph},
  title = {Aurora: A Foundation Model of the Atmosphere},
  year = {2024}
}
@misc{Nguyen2023_Stormer,
  archiveprefix = {arXiv},
  author = {Tung Nguyen and Rohan Shah and Hritik Bansal and Troy Arcomano and Sandeep Madireddy and Romit Maulik and Veerabhadra Kotamarthi and Ian Foster and Aditya Grover},
  date-added = {2024-09-24 14:16:42 +0200},
  date-modified = {2024-09-24 14:16:51 +0200},
  eprint = {2312.03876},
  primaryclass = {physics.ao-ph},
  title = {Scaling transformer neural networks for skillful and reliable medium-range weather forecasting},
  url = {https://arxiv.org/abs/2312.03876},
  year = {2023},
  bdsk-url-1 = {https://arxiv.org/abs/2312.03876}
}
@article{Hakim2023,
  address = {Boston MA, USA},
  author = {Gregory J. Hakim and Sanjit Masanam},
  date-added = {2024-08-20 11:29:05 +0200},
  date-modified = {2024-08-20 11:29:11 +0200},
  doi = {10.1175/AIES-D-23-0090.1},
  journal = {Artificial Intelligence for the Earth Systems},
  number = {3},
  pages = {e230090},
  publisher = {American Meteorological Society},
  title = {Dynamical Tests of a Deep Learning Weather Prediction Model},
  url = {https://journals.ametsoc.org/view/journals/aies/3/3/AIES-D-23-0090.1.xml},
  volume = {3},
  year = {2024},
  bdsk-url-1 = {https://journals.ametsoc.org/view/journals/aies/3/3/AIES-D-23-0090.1.xml},
  bdsk-url-2 = {https://doi.org/10.1175/AIES-D-23-0090.1}
}
@misc{Vaughan2024,
  archiveprefix = {arXiv},
  author = {Anna Vaughan and Stratis Markou and Will Tebbutt and James Requeima and Wessel P. Bruinsma and Tom R. Andersson and Michael Herzog and Nicholas D. Lane and Matthew Chantry and J. Scott Hosking and Richard E. Turner},
  date-added = {2024-08-12 19:17:54 +0200},
  date-modified = {2024-08-12 19:18:03 +0200},
  eprint = {2404.00411},
  primaryclass = {physics.ao-ph},
  title = {Aardvark weather: end-to-end data-driven weather forecasting},
  url = {https://arxiv.org/abs/2404.00411},
  year = {2024},
  bdsk-url-1 = {https://arxiv.org/abs/2404.00411}
}
@misc{Vandal2024_zeus,
  archiveprefix = {arXiv},
  author = {Thomas J. Vandal and Kate Duffy and Daniel McDuff and Yoni Nachmany and Chris Hartshorn},
  date-added = {2024-08-10 17:28:17 +0200},
  date-modified = {2024-08-10 17:28:24 +0200},
  eprint = {2407.11696},
  primaryclass = {cs.LG},
  title = {Global atmospheric data assimilation with multi-modal masked autoencoders},
  url = {https://arxiv.org/abs/2407.11696},
  year = {2024},
  bdsk-url-1 = {https://arxiv.org/abs/2407.11696}
}
@article{Kochkov2024,
  abstract = {General circulation models (GCMs) are the foundation of weather and climate prediction. GCMs are physics-based simulators that combine a numerical solver for large-scale dynamics with tuned representations for small-scale processes such as cloud formation. Recently, machine-learning models trained on reanalysis data have achieved comparable or better skill than GCMs for deterministic weather forecasting. However, these models have not demonstrated improved ensemble forecasts, or shown sufficient stability for long-term weather and climate simulations. Here we present a GCM that combines a differentiable solver for atmospheric dynamics with machine-learning components and show that it can generate forecasts of deterministic weather, ensemble weather and climate on par with the best machine-learning and physics-based methods. NeuralGCM is competitive with machine-learning models for one- to ten-day forecasts, and with the European Centre for Medium-Range Weather Forecasts ensemble prediction for one- to fifteen-day forecasts. With prescribed sea surface temperature, NeuralGCM can accurately track climate metrics for multiple decades, and climate forecasts with 140-kilometre resolution show emergent phenomena such as realistic frequency and trajectories of tropical cyclones. For both weather and climate, our approach offers orders of magnitude computational savings over conventional GCMs, although our model does not extrapolate to substantially different future climates. Our results show that end-to-end deep learning is compatible with tasks performed by conventional GCMs and can enhance the large-scale physical simulations that are essential for understanding and predicting the Earth system.},
  author = {Kochkov, Dmitrii and Yuval, Janni and Langmore, Ian and Norgaard, Peter and Smith, Jamie and Mooers, Griffin and Kl{\"o}wer, Milan and Lottes, James and Rasp, Stephan and D{\"u}ben, Peter and Hatfield, Sam and Battaglia, Peter and Sanchez-Gonzalez, Alvaro and Willson, Matthew and Brenner, Michael P. and Hoyer, Stephan},
  date = {2024-07-22},
  date-added = {2024-07-24 13:03:06 +0200},
  date-modified = {2024-07-24 13:03:14 +0200},
  doi = {10.1038/s41586-024-07744-y},
  id = {Kochkov2024},
  issn = {1476-4687},
  journal = {Nature},
  title = {Neural general circulation models for weather and climate},
  url = {https://doi.org/10.1038/s41586-024-07744-y},
  year = {2024},
  bdsk-url-1 = {https://doi.org/10.1038/s41586-024-07744-y}
}
@misc{Vonich2024,
  archiveprefix = {arXiv},
  author = {P. Trent Vonich and Gregory J. Hakim},
  date-added = {2024-07-13 12:58:40 +0200},
  date-modified = {2024-07-13 12:58:47 +0200},
  eprint = {2406.05019},
  primaryclass = {physics.ao-ph},
  title = {Predictability Limit of the 2021 Pacific Northwest Heatwave from Deep-Learning Sensitivity Analysis},
  url = {https://arxiv.org/abs/2406.05019},
  year = {2024},
  bdsk-url-1 = {https://arxiv.org/abs/2406.05019}
}
@misc{Kovachki2021,
  author = {Kovachki, Nikola and Lanthaler, Samuel and Mishra, Siddhartha},
  title = {On universal approximation and error bounds for Fourier Neural Operators},
  year = {2021},
  abstract = {Fourier neural operators (FNOs) have recently been proposed as an effective framework for learning operators that map between infinite-dimensional spaces. We prove that FNOs are universal, in the sense that they can approximate any continuous operator to desired accuracy. Moreover, we suggest a mechanism by which FNOs can approximate operators associated with PDEs efficiently. Explicit error bounds are derived to show that the size of the FNO, approximating operators associated with a Darcy type elliptic PDE and with the incompressible Navier-Stokes equations of fluid dynamics, only increases sub (log)-linearly in terms of the reciprocal of the error. Thus, FNOs are shown to efficiently approximate operators arising in a large class of PDEs.},
  bdsk-url-1 = {https://arxiv.org/abs/2107.07562},
  bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2107.07562},
  copyright = {Creative Commons Attribution 4.0 International},
  date-added = {2022-08-13 19:24:05 +0200},
  date-modified = {2022-08-13 19:24:16 +0200},
  doi = {10.48550/ARXIV.2107.07562},
  keywords = {Numerical Analysis (math.NA), FOS: Mathematics},
  publisher = {arXiv},
  url = {https://arxiv.org/abs/2107.07562}
}
@misc{li2022superresolution,
  author = {Yang Li and Haiyu Dong and Zuliang Fang and Jonathan Weyn and Pete Luferenko},
  title = {Super-resolution Probabilistic Rain Prediction from Satellite Data Using 3D U-Nets and EarthFormers},
  year = {2022},
  abstract = {Accurate and timely rain prediction is crucial for decision making and is also a challenging task. This paper presents a solution which won the 2nd prize in the Weather4cast 2022 NeurIPS competition using 3D U-Nets and EarthFormers for 8-hour probabilistic rain prediction based on multi-band satellite images. The spatial context effect of the input satellite image has been deeply explored and optimal context range has been found. Based on the imbalanced rain distribution, we trained multiple models with different loss functions. To further improve the model performance, multi-model ensemble and threshold optimization were used to produce the final probabilistic rain prediction. Experiment results and leaderboard scores demonstrate that optimal spatial context, combined loss function, multi-model ensemble, and threshold optimization all provide modest model gain. A permutation test was used to analyze the effect of each satellite band on rain prediction, and results show that satellite bands signifying cloud-top phase (8.7 um) and cloud-top height (10.8 and 13.4 um) are the best predictors for rain prediction. The source code is available at this https URL.},
  archiveprefix = {arXiv},
  date-added = {2023-08-01 20:29:57 +0200},
  date-modified = {2023-08-01 20:29:57 +0200},
  eprint = {2212.02998},
  primaryclass = {cs.CV}
}
@article{Bi:2023aa,
  abstract = {Weather forecasting is important for science and society. At present, the most accurate forecast system is the numerical weather prediction (NWP) method, which represents atmospheric states as discretized grids and numerically solves partial differential equations that describe the transition between those states. However, this procedure is computationally expensive. Recently, artificial-intelligence-based methods have shown potential in accelerating weather forecasting by orders of magnitude, but the forecast accuracy is still significantly lower than that of NWP methods. Here we introduce an artificial-intelligence-based method for accurate, medium-range global weather forecasting. We show that three-dimensional deep networks equipped with Earth-specific priors are effective at dealing with complex patterns in weather data, and that a hierarchical temporal aggregation strategy reduces accumulation errors in medium-range forecasting. Trained on 39 years of global data, our program, Pangu-Weather, obtains stronger deterministic forecast results on reanalysis data in all tested variables when compared with the world's best NWP system, the operational integrated forecasting system of the European Centre for Medium-Range Weather Forecasts (ECMWF). Our method also works well with extreme weather forecasts and ensemble forecasts. When initialized with reanalysis data, the accuracy of tracking tropical cyclones is also higher than that of ECMWF-HRES.},
  author = {Bi, Kaifeng and Xie, Lingxi and Zhang, Hengheng and Chen, Xin and Gu, Xiaotao and Tian, Qi},
  date = {2023-07-05},
  date-added = {2023-07-13 11:08:09 +0200},
  date-modified = {2023-07-13 11:08:09 +0200},
  doi = {10.1038/s41586-023-06185-3},
  id = {Bi2023},
  issn = {1476-4687},
  journal = {Nature},
  title = {Accurate medium-range global weather forecasting with 3D neural networks},
  url = {https://doi.org/10.1038/s41586-023-06185-3},
  year = {2023},
  bdsk-url-1 = {https://doi.org/10.1038/s41586-023-06185-3}
}
@article{Lin2022,
  abstract = {Artificial intelligence has the potential to open insight into the structure of proteins at the scale of evolution. It has only recently been possible to extend protein structure prediction to two hundred million cataloged proteins. Characterizing the structures of the exponentially growing billions of protein sequences revealed by large scale gene sequencing experiments would necessitate a breakthrough in the speed of folding. Here we show that direct inference of structure from primary sequence using a large language model enables an order of magnitude speed-up in high resolution structure prediction. Leveraging the insight that language models learn evolutionary patterns across millions of sequences, we train models up to 15B parameters, the largest language model of proteins to date. As the language models are scaled they learn information that enables prediction of the three-dimensional structure of a protein at the resolution of individual atoms. This results in prediction that is up to 60x faster than state-of-the-art while maintaining resolution and accuracy. Building on this, we present the ESM Metagenomic Atlas. This is the first large-scale structural characterization of metagenomic proteins, with more than 617 million structures. The atlas reveals more than 225 million high confidence predictions, including millions whose structures are novel in comparison with experimentally determined structures, giving an unprecedented view into the vast breadth and diversity of the structures of some of the least understood proteins on earth.},
  author = {Lin, Zeming and Akin, Halil and Rao, Roshan and Hie, Brian and Zhu, Zhongkai and Lu, Wenting and Smetanin, Nikita and Verkuil, Robert and Kabeli, Ori and Shmueli, Yaniv and dos Santos Costa, Allan and Fazel-Zarandi, Maryam and Sercu, Tom and Candido, Salvatore and Rives, Alexander},
  date-added = {2022-11-11 09:42:27 +0100},
  date-modified = {2022-11-11 09:43:09 +0100},
  doi = {10.1101/2022.07.20.500902},
  elocation-id = {2022.07.20.500902},
  eprint = {https://www.biorxiv.org/content/early/2022/10/31/2022.07.20.500902.full.pdf},
  journal = {bioRxiv},
  publisher = {Cold Spring Harbor Laboratory},
  title = {Evolutionary-scale prediction of atomic level protein structure with a language model},
  url = {https://www.biorxiv.org/content/early/2022/10/31/2022.07.20.500902},
  year = {2022},
  bdsk-url-1 = {https://www.biorxiv.org/content/early/2022/10/31/2022.07.20.500902},
  bdsk-url-2 = {https://doi.org/10.1101/2022.07.20.500902}
}
@misc{mardani2023generative,
  author = {Morteza Mardani and Noah Brenowitz and Yair Cohen and Jaideep Pathak and Chieh-Yu Chen and Cheng-Chin Liu and Arash Vahdat and Karthik Kashinath and Jan Kautz and Mike Pritchard},
  title = {Generative Residual Diffusion Modeling for Km-scale Atmospheric Downscaling},
  year = {2023},
  abstract = {The state of the art for physical hazard prediction from weather and climate requires expensive km-scale numerical simulations driven by coarser resolution global inputs. Here, a km-scale downscaling diffusion model is presented as a cost effective alternative. The model is trained from a regional high-resolution weather model over Taiwan, and conditioned on ERA5 reanalysis data. To address the downscaling uncertainties (large resolution ratios of 25 km to 2 km, different physics involved at different scales, and prediction of channels that are not in the input data), we employ a two-step approach (\textit{ResDiff}) in which a (UNet) regression predicts the mean in the first step and a diffusion model predicts the residual in the second step. \textit{ResDiff} exhibits encouraging skill in bulk RMSE and CRPS scores. The predicted spectra and distributions from ResDiff faithfully recover important power law relationships regulating damaging wind and rain extremes. Case studies of coherent weather phenomena reveal appropriate multivariate relationships reminiscent of learnt physics. This includes the sharp wind and temperature variations that co-locate with intense rainfall in a cold front, and the extreme winds and rainfall bands that surround the eyewall of typhoons. Some evidence of simultaneous bias correction is found. A first attempt at downscaling directly from an operational global forecast model successfully retains many of these benefits. The implication is that a new era of fully end-to-end, global-to-regional machine learning weather prediction is likely near at hand.},
  archiveprefix = {arXiv},
  eprint = {2309.15214},
  primaryclass = {cs.LG}
}
@misc{Hutchinson2020,
  author = {Hutchinson, Michael and Lan, Charline Le and Zaidi, Sheheryar and Dupont, Emilien and Teh, Yee Whye and Kim, Hyunjik},
  title = {LieTransformer: Equivariant self-attention for Lie Groups},
  year = {2020},
  abstract = {Group equivariant neural networks are used as building blocks of group invariant neural networks, which have been shown to improve generalisation performance and data efficiency through principled parameter sharing. Such works have mostly focused on group equivariant convolutions, building on the result that group equivariant linear maps are necessarily convolutions. In this work, we extend the scope of the literature to self-attention, which is emerging as a prominent building block of deep learning models. We propose the LieTransformer, an architecture composed of LieSelfAttention layers that are equivariant to arbitrary Lie groups and their discrete subgroups. We demonstrate the generality of our approach by showing experimental results that are competitive to baseline methods on a wide range of tasks: shape counting on point clouds, molecular property regression and modelling particle trajectories under Hamiltonian dynamics.},
  bdsk-url-1 = {https://arxiv.org/abs/2012.10885},
  bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2012.10885},
  copyright = {arXiv.org perpetual, non-exclusive license},
  date-added = {2022-08-13 09:57:27 +0200},
  date-modified = {2022-08-13 09:57:34 +0200},
  doi = {10.48550/ARXIV.2012.10885},
  keywords = {Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences},
  publisher = {arXiv},
  url = {https://arxiv.org/abs/2012.10885}
}
@article{DeVore2021,
  author = {Ronald DeVore and Boris Hanin and Guergana Petrova},
  journal = {Acta Numerica},
  title = {Neural network approximation},
  year = {2021},
  month = {may},
  pages = {327--444},
  volume = {30},
  abstract = {Neural networks (NNs) are the method of choice for building learning algorithms. They are now being investigated for other numerical tasks such as solving high-dimensional partial differential equations. Their popularity stems from their empirical success on several challenging learning problems (computer chess/Go, autonomous navigation, face recognition). However, most scholars agree that a convincing theoretical explanation for this success is still lacking. Since these applications revolve around approximating an unknown function from data observations, part of the answer must involve the ability of NNs to produce accurate approximations.

This article surveys the known approximation properties of the outputs of NNs with the aim of uncovering the properties that are not present in the more traditional methods of approximation used in numerical analysis, such as approximations using polynomials, wavelets, rational functions and splines. Comparisons are made with traditional approximation methods from the viewpoint of rate distortion, i.e. error versus the number of parameters used to create the approximant. Another major component in the analysis of numerical approximation is the computational time needed to construct the approximation, and this in turn is intimately connected with the stability of the approximation algorithm. So the stability of numerical approximation using NNs is a large part of the analysis put forward.

The survey, for the most part, is concerned with NNs using the popular ReLU activation function. In this case the outputs of the NNs are piecewise linear functions on rather complicated partitions of the domain of f into cells that are convex polytopes. When the architecture of the NN is fixed and the parameters are allowed to vary, the set of output functions of the NN is a parametrized nonlinear manifold. It is shown that this manifold has certain space-filling properties leading to an increased ability to approximate (better rate distortion) but at the expense of numerical stability. The space filling creates the challenge to the numerical method of finding best or good parameter choices when trying to approximate f.},
  doi = {10.1017/s0962492921000052},
  publisher = {Cambridge University Press ({CUP})}
}
@misc{Scaman2018,
  author = {Scaman, Kevin and Virmaux, Aladin},
  title = {Lipschitz regularity of deep neural networks: analysis and efficient estimation},
  year = {2018},
  month = may,
  abstract = {Deep neural networks are notorious for being sensitive to small well-chosen perturbations, and estimating the regularity of such architectures is of utmost importance for safe and robust practical applications. In this paper, we investigate one of the key characteristics to assess the regularity of such methods: the Lipschitz constant of deep learning architectures. First, we show that, even for two layer neural networks, the exact computation of this quantity is NP-hard and state-of-the-art methods may significantly overestimate it. Then, we both extend and improve previous estimation methods by providing AutoLip, the first generic algorithm for upper bounding the Lipschitz constant of any automatically differentiable function. We provide a power method algorithm working with automatic differentiation, allowing efficient computations even on large convolutions. Second, for sequential neural networks, we propose an improved algorithm named SeqLip that takes advantage of the linear computation graph to split the computation per pair of consecutive layers. Third we propose heuristics on SeqLip in order to tackle very large networks. Our experiments show that SeqLip can significantly improve on the existing upper bounds. Finally, we provide an implementation of AutoLip in the PyTorch environment that may be used to better estimate the robustness of a given neural network to small perturbations or regularize it using more precise Lipschitz estimations.},
  archiveprefix = {arXiv},
  copyright = {arXiv.org perpetual, non-exclusive license},
  doi = {10.48550/ARXIV.1805.10965},
  eprint = {1805.10965},
  file = {:http\://arxiv.org/pdf/1805.10965v2:PDF},
  keywords = {Machine Learning (stat.ML), Machine Learning (cs.LG), FOS: Computer and information sciences},
  primaryclass = {stat.ML},
  publisher = {arXiv}
}
@article{Tanyu2023,
  author = {Derick Nganyu Tanyu and Jianfeng Ning and Tom Freudenberg and Nick Heilenkötter and Andreas Rademacher and Uwe Iben and Peter Maass},
  journal = {Inverse Problems},
  title = {Deep learning methods for partial differential equations and related parameter identification problems},
  year = {2023},
  month = {aug},
  number = {10},
  pages = {103001},
  volume = {39},
  abstract = {Recent years have witnessed a growth in mathematics for deep learning—which seeks a deeper understanding of the concepts of deep learning with mathematics and explores how to make it more robust—and deep learning for mathematics, where deep learning algorithms are used to solve problems in mathematics. The latter has popularised the field of scientific machine learning where deep learning is applied to problems in scientific computing. Specifically, more and more neural network (NN) architectures have been developed to solve specific classes of partial differential equations (PDEs). Such methods exploit properties that are inherent to PDEs and thus solve the PDEs better than standard feed-forward NNs, recurrent NNs, or convolutional neural networks. This has had a great impact in the area of mathematical modelling where parametric PDEs are widely used to model most natural and physical processes arising in science and engineering. In this work, we review such methods as well as their extensions for parametric studies and for solving the related inverse problems. We also show their relevance in various industrial applications.},
  doi = {10.1088/1361-6420/ace9d4},
  publisher = {{IOP} Publishing}
}
@misc{Mueller2021,
  author = {Müller, Johannes and Zeinhofer, Marius},
  title = {Error Estimates for the Deep Ritz Method with Boundary Penalty},
  year = {2021},
  month = mar,
  abstract = {We estimate the error of the Deep Ritz Method for linear elliptic equations. For Dirichlet boundary conditions, we estimate the error when the boundary values are imposed through the boundary penalty method. Our results apply to arbitrary sets of ansatz functions and estimate the error in dependence of the optimization accuracy, the approximation capabilities of the ansatz class and -- in the case of Dirichlet boundary values -- the penalization strength $\lambda$. To the best of our knowledge, our results are presently the only ones in the literature that treat the case of Dirichlet boundary conditions in full generality, i.e., without a lower order term that leads to coercivity on all of $H^1(\Omega)$. Further, we discuss the implications of our results for ansatz classes which are given through ReLU networks and the relation to existing estimates for finite element functions. For high dimensional problems our results show that the favourable approximation capabilities of neural networks for smooth functions are inherited by the Deep Ritz Method.},
  archiveprefix = {arXiv},
  copyright = {arXiv.org perpetual, non-exclusive license},
  doi = {10.48550/ARXIV.2103.01007},
  eprint = {2103.01007},
  file = {:http\://arxiv.org/pdf/2103.01007v4:PDF},
  keywords = {Numerical Analysis (math.NA), Machine Learning (cs.LG), FOS: Mathematics, FOS: Computer and information sciences, 65N15, 68T07},
  primaryclass = {math.NA},
  publisher = {arXiv}
}
@comment{{jabref-meta: databaseType:bibtex;}}
