@techreport{CRNGB13,
author = {Guillaume Claret and Sriram K. Rajamani and Aditya V. Nori and Andrew D. Gordon and Johannes Borgstr{\"o}m},
title = {{Bayesian} Inference Using Data Flow Analysis},
number = {MSR-TR-2013-27},
institution = {Microsoft Research},
month = mar,
year = {2013},
url = {http://research.microsoft.com/apps/pubs/default.aspx?id=171611},
abstract = {We present a new algorithm for Bayesian inference over probabilistic programs, based on data flow analysis techniques from the program analysis community. Unlike existing techniques for Bayesian inference on probabilistic programs, our data flow analysis algorithm is able to perform inference directly on probabilistic programs with loops. Even for loop-free programs, we show that data flow analysis offers better precision and better performance benefits over existing techniques. We also describe heuristics that are crucial for our inference to scale, and present an empirical evaluation of our algorithm over a range of benchmarks.}
}

@techreport{GABCGNRR13,
author = {Andrew D. Gordon and Mihhail Aizatulin and Johannes Borgstr{\"o}m and Guillaume Claret and Thore Graepel and Aditya V. Nori and Sriram K. Rajamani and Claudio Russo},
title = {A Model-Learner Pattern for {Bayesian} Reasoning},
month = jan,
year = {2013},
number = {MSR-TR-2013-1},
institution = {Microsoft Research},
abstract = {A Bayesian model is based on a pair of probability distributions, known as the prior and sampling distributions. A wide range of fundamental machine learning tasks, including regression, classification, clustering, and many others, can all be seen as Bayesian models. We propose a new probabilistic programming abstraction, a typed Bayesian model, based on a pair of probabilistic expressions for the prior and sampling distributions. A sampler for a model is an algorithm to compute synthetic data from its sampling distribution, while a learner for a model is an algorithm for probabilistic inference on the model. Models, samplers, and learners form a generic programming pattern for model-based inference. They support the uniform expression of common tasks including model testing, and generic compositions such as mixture models, evidence-based model averaging, and mixtures of experts. A formal semantics supports reasoning about model equivalence and implementation correctness. By developing a series of examples and three learner implementations based on exact inference, factor graphs, and Markov chain Monte Carlo, we demonstrate the broad applicability of this new programming pattern.},
url = {http://research.microsoft.com/apps/pubs/default.aspx?id=173887}
}

@inproceedings{BBGR13,
author = {Sooraj Bhat and Johannes Borgstr{\"o}m and Andrew D. Gordon and Claudio Russo},
title = {Deriving Probability Density Functions from Probabilistic Functional Programs},
booktitle = {19th Int. Conf. on Tools and Algorithms for the Construction and Analysis of Systems (TACAS)},
publisher = {Springer},
year = {2013},
note = {EAPLS Best Paper Award for ETAPS 2013},
url = {http://research.microsoft.com/apps/pubs/default.aspx?id=189021},
abstract = {The probability density function of a probability distribution is a fundamental concept in probability theory and a key ingredient in various widely used machine learning methods. However, the necessary framework for compiling probabilistic functional programs to density functions has only recently been developed. In this work, we present a density compiler for a probabilistic language with discrete and continuous distributions, and discrete observations, and provide a proof of its soundness. The compiler greatly reduces the development effort of domain experts, which we demonstrate by solving inference problems from various scientific applications, such as modelling the global carbon cycle, using a standard Markov chain Monte Carlo framework.}
}

@inproceedings{Gib2013,
author = {Jeremy Gibbons},
title = {Unifying Theories of Programming with Monads},
booktitle = {Unifying Theories of Programming},
series = {Lecture Notes in Computer Science},
volume = {7681},
pages = {23--67},
publisher = {Springer},
year = {2013},
doi = {10.1007/978-3-642-35705-3_2},
abstract = {The combination of probabilistic and nondeterministic choice in program calculi is a notoriously tricky problem, and one with a long history. We present a simple functional programming approach to this challenge, based on algebraic theories of computational effects. We make use of the powerful abstraction facilities of modern functional languages, to introduce the choice operations as a little embedded domain-specific language rather than having to define a language extension; we rely on referential transparency, to justify straightforward equational reasoning about program behaviour.}
}

@incollection{FRT12,
author = {Cameron E. Freer and Daniel M. Roy and Joshua B. Tenenbaum},
title = {Towards common-sense reasoning via conditional simulation: legacies of {Turing} in {Artificial Intelligence}},
booktitle = {{Turing's} Legacy},
series = {ASL Lecture Notes in Logic},
publisher = {Cambridge University Press},
editor = {Rod Downey},
year = {2012},
archiveprefix = {arXiv},
eprint = {1212.4799},
primaryclass = {cs.AI},
url = {http://danroy.org/papers/FreRoyTen-Turing.pdf},
keywords = {probabilistic programming, Alan Turing, artificial intelligence},
abstract = {The problem of replicating the flexibility of human common-sense reasoning has captured the imagination of computer scientists since the early days of Alan Turing's foundational work on computation and the philosophy of artificial intelligence. In the intervening years, the idea of cognition as computation has emerged as a fundamental tenet of Artificial Intelligence (AI) and cognitive science. But what kind of computation is cognition?
We describe a computational formalism centered around a probabilistic Turing machine called QUERY, which captures the operation of probabilistic conditioning via conditional simulation. Through several examples and analyses, we demonstrate how the QUERY abstraction can be used to cast common-sense reasoning as probabilistic inference in a statistical model of our observations and the uncertain structure of the world that generated that experience. This formulation is a recent synthesis of several research programs in AI and cognitive science, but it also represents a surprising convergence of several of Turing's pioneering insights in AI, the foundations of computation, and statistics.}
}

@inproceedings{SG12,
author = {Andreas Stuhlm{\"u}ller and Noah D. Goodman},
archiveprefix = {arXiv},
booktitle = {Second Statistical Relational AI workshop at UAI 2012 (StaRAI-12)},
eprint = {1206.3555},
primaryclass = {cs.AI},
title = {A Dynamic Programming Algorithm for Inference in Recursive Probabilistic Programs},
year = {2012},
keywords = {probabilistic programming, inference, dynamic programming},
abstract = {We describe a dynamic programming algorithm for computing the marginal distribution of discrete probabilistic programs. This algorithm takes a functional interpreter for an arbitrary probabilistic programming language and turns it into an efficient marginalizer. Because direct caching of sub-distributions is impossible in the presence of recursion, we build a graph of dependencies between sub-distributions. This factored sum-product network makes (potentially cyclic) dependencies between subproblems explicit, and corresponds to a system of equations for the marginal distribution. We solve these equations by fixed-point iteration in topological order. We illustrate this algorithm on examples used in teaching probabilistic models, computational cognitive science research, and game theory.}
}

@misc{AFR10,
author = {Nathanael L. Ackerman and Cameron E. Freer and Daniel M. Roy},
title = {On the computability of conditional probability},
year = {2010},
archiveprefix = {arXiv},
eprint = {1005.3014},
url = {http://danroy.org/papers/AckFreRoy-CompCondProb-preprint.pdf},
keywords = {computable probability theory, conditional probability},
abstract = {As inductive inference and machine learning methods in computer science see continued success, researchers are aiming to describe even more complex probabilistic models and inference algorithms. What are the limits of mechanizing probabilistic inference? We investigate the computability of conditional probability, a fundamental notion in probability theory and a cornerstone of Bayesian statistics, and show that there are computable joint distributions with noncomputable conditional distributions, ruling out the prospect of general inference algorithms, even inefficient ones. Specifically, we construct a pair of computable random variables in the unit interval such that the conditional distribution of the first variable given the second encodes the halting problem. Nevertheless, probabilistic inference is possible in many common modeling settings, and we prove several results giving broadly applicable conditions under which conditional distributions are computable. In particular, conditional distributions become computable when measurements are corrupted by independent computable noise with a sufficiently smooth density.}
}

@misc{FR09,
author = {Cameron E. Freer and Daniel M. Roy},
title = {Computable de~{Finetti} measures},
year = {2009},
archiveprefix = {arXiv},
eprint = {0912.1072},
url = {http://danroy.org/papers/FreerRoy-CompDeFinetti-preprint.pdf},
doi = {10.1016/j.apal.2011.06.011},
abstract = {We prove a computable version of de Finetti's theorem on exchangeable sequences of real random variables. As a consequence, exchangeable stochastic processes expressed in probabilistic functional programming languages can be automatically rewritten as procedures that do not modify non-local state. Along the way, we prove that a distribution on the unit interval is computable if and only if its moments are uniformly computable.}
}

@inproceedings{AFR11,
author = {Nathanael L. Ackerman and Cameron E. Freer and Daniel M. Roy},
title = {Noncomputable conditional distributions},
booktitle = {Proc. of the 26th Ann. Symp. on Logic in Comp. Sci.},
publisher = {IEEE Press},
year = {2011},
url = {http://danroy.org/papers/AckFreRoy-LICS-2011.pdf},
abstract = {We study the computability of conditional probability, a fundamental notion in probability theory and Bayesian statistics. In the elementary discrete setting, a ratio of probabilities defines conditional probability. In more general settings, conditional probability is defined axiomatically, and the search for more constructive definitions is the subject of a rich literature in probability theory and statistics. However, we show that in general one cannot compute conditional probabilities. Specifically, we construct a pair of computable random variables (X, Y) in the unit interval whose conditional distribution P[Y|X] encodes the halting problem.
Nevertheless, probabilistic inference has proven remarkably successful in practice, even in infinite-dimensional continuous settings. We prove several results giving general conditions under which conditional distributions are computable. In the discrete or dominated setting, under suitable computability hypotheses, conditional distributions are computable. Likewise, conditioning is a computable operation in the presence of certain additional structure, such as independent absolutely continuous noise.}
}

@inproceedings{GMR+08,
title = {{Church}: a language for generative models},
author = {Noah D. Goodman and Vikash K. Mansinghka and Daniel M. Roy and Keith Bonawitz and Joshua B. Tenenbaum},
booktitle = {Proc. of Uncertainty in Artificial Intelligence},
year = {2008},
url = {http://danroy.org/papers/church_GooManRoyBonTen-UAI-2008.pdf},
abstract = {Formal languages for probabilistic modeling enable re-use, modularity, and descriptive clarity, and can foster generic inference techniques. We introduce Church, a universal language for describing stochastic generative processes. Church is based on the Lisp model of lambda calculus, containing a pure Lisp as its deterministic subset. The semantics of Church is defined in terms of evaluation histories and conditional distributions on such histories. Church also includes a novel language construct, the stochastic memoizer, which enables simple description of many complex non-parametric models. We illustrate language features through several examples, including: a generalized Bayes net in which parameters cluster over trials, infinite PCFGs, planning by inference, and various non-parametric clustering models. Finally, we show how to implement query on any Church program, exactly and approximately, using Monte Carlo techniques.}
}

@inproceedings{FR10,
title = {Posterior distributions are computable from predictive distributions},
author = {Cameron E. Freer and Daniel M. Roy},
booktitle = {Proc. of the 13th Artificial Intelligence and Statistics},
editor = {Y. W. Teh and M. Titterington},
pages = {233--240},
location = {Chia Laguna, Sardinia, Italy},
year = {2010},
url = {http://danroy.org/papers/FreerRoy-AISTATS-2010.pdf},
abstract = {As we devise more complicated prior distributions, will inference algorithms keep up? We highlight a negative result in computable probability theory by Ackerman, Freer, and Roy (2010) that shows that there exist computable priors with noncomputable posteriors. In addition to providing a brief survey of computable probability theory geared towards the A.I. and statistics community, we give a new result characterizing when conditioning is computable in the setting of exchangeable sequences, and provide a computational perspective on work by Orbanz (2010) on conjugate nonparametric models. In particular, using a computable extension of de Finetti's theorem (Freer and Roy 2009), we describe how to transform a posterior predictive rule for generating an exchangeable sequence into an algorithm for computing the posterior distribution of the directing random measure.}
}

@techreport{OGT09,
author = {Timothy J. O'Donnell and Noah D. Goodman and Joshua B. Tenenbaum},
title = {Fragment Grammars: Exploring Computation and Reuse in Language},
institution = {Massachusetts Institute of Technology},
number = {MIT-CSAIL-TR-2009-013},
url = {http://dspace.mit.edu/handle/1721.1/44963},
year = {2009},
abstract = {Language relies on a division of labor between stored units and structure building operations which combine the stored units into larger structures. This division of labor leads to a tradeoff: more structure-building means less need to store while more storage means less need to compute structure. We develop a hierarchical Bayesian model called fragment grammar to explore the optimum balance between structure-building and reuse. The model is developed in the context of stochastic functional programming (SFP), and in particular, using a probabilistic variant of Lisp known as the Church programming language. We show how to formalize several probabilistic models of language structure using Church, and how fragment grammar generalizes one of them---adaptor grammars. We conclude with experimental data with adults and preliminary evaluations of the model on natural language corpus data.}
}

@phdthesis{Man09,
author = {Vikash Mansinghka},
title = {Natively Probabilistic Computation},
school = {Massachusetts Institute of Technology},
year = {2009},
url = {http://web.mit.edu/vkm/www/vkm-dissertation.pdf},
note = {MIT/EECS George M. Sprowls Doctoral Dissertation Award},
abstract = {I introduce a new set of natively probabilistic computing abstractions, including probabilistic generalizations of Boolean circuits, backtracking search and pure Lisp. I show how these tools let one compactly specify probabilistic generative models, generalize and parallelize widely used sampling algorithms like rejection sampling and Markov chain Monte Carlo, and solve difficult Bayesian inference problems.
I first introduce Church, a probabilistic programming language for describing probabilistic generative processes that induce distributions, which generalizes Lisp, a language for describing deterministic procedures that induce functions. I highlight the ways randomness meshes with the reflectiveness of Lisp to support the representation of structured, uncertain knowledge, including nonparametric Bayesian models from the current literature, programs for decision making under uncertainty, and programs that learn very simple programs from data. I then introduce systematic stochastic search, a recursive algorithm for exact and approximate sampling that generalizes a popular form of backtracking search to the broader setting of stochastic simulation and recovers widely used particle filters as a special case. I use it to solve probabilistic reasoning problems from statistical physics, causal reasoning and stereo vision. Finally, I introduce stochastic digital circuits that model the probability algebra just as traditional Boolean circuits model the Boolean algebra. I show how these circuits can be used to build massively parallel, fault-tolerant machines for sampling and allow one to efficiently run Markov chain Monte Carlo methods on models with hundreds of thousands of variables in real time.
I emphasize the ways in which these ideas fit together into a coherent software and hardware stack for natively probabilistic computing, organized around distributions and samplers rather than deterministic functions. I argue that by building uncertainty and randomness into the foundations of our programming languages and computing machines, we may arrive at ones that are more powerful, flexible and efficient than deterministic designs, and are in better alignment with the needs of computational science, statistics and artificial intelligence.}
}

@phdthesis{Roy11,
author = {Daniel M. Roy},
title = {Computability, inference and modeling in probabilistic programming},
school = {Massachusetts Institute of Technology},
year = {2011},
note = {MIT/EECS George M. Sprowls Doctoral Dissertation Award},
url = {http://danroy.org/papers/Roy-PHD-2011.pdf},
abstract = {We investigate the class of computable probability distributions and explore the fundamental limitations of using this class to describe and compute conditional distributions. In addition to proving the existence of noncomputable conditional distributions, and thus ruling out the possibility of generic probabilistic inference algorithms (even inefficient ones), we highlight some positive results showing that posterior inference is possible in the presence of additional structure like exchangeability and noise, both of which are common in Bayesian hierarchical modeling.
This theoretical work bears on the development of probabilistic programming languages (which enable the specification of complex probabilistic models) and their implementations (which can be used to perform Bayesian reasoning). The probabilistic programming approach is particularly well suited for defining infinite-dimensional, recursively-defined stochastic processes of the sort used in nonparametric Bayesian statistics. We present a new construction of the Mondrian process as a partition-valued Markov process in continuous time, which can be viewed as placing a distribution on an infinite $k$d-tree data structure.}
}

@phdthesis{O'D11,
author = {O'Donnell, Timothy J.},
title = {Productivity and Reuse in Language},
school = {Harvard University},
year = {2011},
abstract = {This thesis presents a formal model of productivity and reuse which treats the problem as a structure-by-structure inference in a Bayesian framework. The model---Fragment Grammars, a generalization of Adaptor Grammars (Johnson et al., 2007)---is built around two proposals. The first is that anything that can be computed can be stored. The specific computational mechanism by which this is accomplished, stochastic memoization, is inherited from Adaptor Grammars (Goodman et al., 2008; Johnson et al., 2007). The second proposal is that any stored item can include subparts which must be computed productively. This is made possible by the computational mechanism of stochastically lazy evaluation, introduced in the thesis.}
}

@phdthesis{Mil06,
author = {Brian Milch},
year = {2006},
title = {Probabilistic Models with Unknown Objects},
department = {Computer Science Division},
school = {University of California, Berkeley},
url = {http://sites.google.com/site/bmilch/papers/milch_thesis.pdf},
abstract = {Humans and other intelligent agents must make inferences about the real-world objects that underlie their observations: for instance, the objects visible in an image, or the people mentioned in a set of text documents. The agent may not know in advance how many objects exist, how they are related to each other, or which observations correspond to which underlying objects. Existing declarative representations for probabilistic models do not capture the structure of such scenarios.
This thesis introduces Bayesian logic (BLOG), a first-order probabilistic modeling language that specifies probability distributions over possible worlds with varying sets of objects. A BLOG model contains statements that define conditional probability distributions for a certain set of random variables; the model also specifies certain context-specific independence properties. We provide criteria under which such a model is guaranteed to fully define a probability distribution. These criteria go beyond existing results in that they can be satisfied even when the Bayesian network defined by the model is cyclic, or contains nodes with infinitely many ancestors.
We describe several approximate inference algorithms that exploit the context-specific dependence structure revealed by a BLOG model. First, we present rejection sampling and likelihood weighting algorithms that are guaranteed to converge to the correct probability for any query on a structurally well-defined BLOG model. Because these algorithms instantiate only those variables that are context-specifically relevant, they can generate samples in finite time even when the model defines infinitely many variables. We then define a general framework for inference on BLOG models using Markov chain Monte Carlo (MCMC) algorithms. This framework allows a programmer to plug in a domain-specific proposal distribution, which helps the Markov chain move to high-probability worlds. Furthermore, the chain can operate on partial world descriptions that specify values only for context-specifically relevant variables. We give conditions under which MCMC over such partial world descriptions is guaranteed to converge to correct probabilities. We also show that this framework performs efficiently on a real-world task: reconstructing the set of distinct publications referred to by a set of bibliographic citations.}
}

@article{KG07,
author = {Jouni Kerman and Andrew Gelman},
title = {Manipulating and summarizing posterior simulations using random variable objects},
journal = {Statistics and Computing},
year = {2007},
volume = {17},
pages = {235--244},
doi = {10.1007/s11222-007-9020-4},
url = {http://www.stat.columbia.edu/~gelman/research/published/postsim.pdf},
abstract = {Practical Bayesian data analysis involves manipulating and summarizing simulations from the posterior distribution of the unknown parameters. By manipulation we mean computing posterior distributions of functions of the unknowns, and generating posterior predictive distributions. The results need to be summarized both numerically and graphically.
We introduce, and implement in R, an object-oriented programming paradigm based on a random variable object type that is implicitly represented by simulations. This makes it possible to define vector and array objects that may contain both random and deterministic quantities, and syntax rules that allow to treat these objects like any numeric vectors or arrays, providing a solution to various problems encountered in Bayesian computing involving posterior simulations.
We illustrate the use of this new programming environment with examples of Bayesian computing, demonstrating missing-value imputation, nonlinear summary of regression predictions, and posterior predictive checking.}
}