@article{citeulike:2175802,
	abstract = {Valiant (SIAM J. Comput. 8 (1979) 410-421) showed that the problem of computing the number of simple s-t paths in graphs is \#P-complete both in the case of directed graphs and in the case of undirected graphs. Welsh (Complexity: Knots, Colourings and Counting, Cambridge University Press, Cambridge, 1993, p. 17) asked whether the problem of computing the number of self-avoiding walks of a given length in the complete two-dimensional grid is complete for \#P1, the tally-version of \#P. This paper offers a partial answer to the question of Welsh: it is \#P-complete to compute the number of self-avoiding walks of a given length in a subgraph of a two-dimensional grid. Several variations of the problem are also studied and shown to be \#P-complete. This paper also studies the problem of computing the number of self-avoiding walks in a subgraph of a hypercube. Similar completeness results are shown for the problem. By scaling the computation time to exponential, it is shown that computing the number of self-avoiding walks in hypercubes is a complete problem for \#EXP in the case when a subgraph of a hypercube is specified by its dimension and a boolean circuit that accepts the nodes. Finally, this paper studies the complexity of testing whether a given word over the four-letter alphabet \{U,D,L,R\} represents a self-avoiding walk in a two-dimensional grid. A linear-space lower bound is shown for nondeterministic Turing machines with a 1-way input head to make this test.},
	author = {Li{\'s}kiewicz, Maciej and Ogihara, Mitsunori and Toda, Seinosuke},
	citeulike-article-id = {2175802},
	comment = {- Counting SAW's is hard},
	doi = {10.1016/S0304-3975(03)00080-X},
	journal = {Theoretical Computer Science},
	keywords = {combinatorics, saw},
	month = jul,
	number = {1--3},
	pages = {129--156},
	priority = {2},
	title = {The complexity of counting self-avoiding walks in subgraphs of two-dimensional grids and hypercubes},
	url = {http://dx.doi.org/10.1016/S0304-3975(03)00080-X},
	volume = {304},
	year = {2003}
}


% NOTE(review): institution inferred from the TR-94-055 report number and the
% Berkeley address (ICSI report numbering) -- confirm against the report cover.
@techreport{citeulike:2175794,
	abstract = {In this thesis we consider two classical combinatorial problems arising in statistical mechanics:
counting matchings and self-avoiding walks in lattice graphs. The first problem
arises in the study of the thermodynamical properties of monomers and dimers (diatomic
molecules) in crystals. Fisher, Kasteleyn and Temperley discovered an elegant technique
to exactly count the number of perfect matchings in two dimensional lattices, but it is not
applicable for matchings of arbitrary size, or in...},
	address = {Berkeley, CA},
	author = {Randall, Dana},
	citeulike-article-id = {2175794},
	comment = {- MCMC algorithm for generating self-avoiding walks (also had a follow up paper)},
	institution = {International Computer Science Institute},
	keywords = {combinatorics, saw},
	number = {TR-94-055},
	priority = {2},
	title = {Counting in Lattices: Combinatorial Problems from Statistical Mechanics},
	url = {http://citeseer.ist.psu.edu/randall94counting.html},
	year = {1994}
}


% month taken from the SICI date (198712) embedded in the url field.
@article{citeulike:2175631,
	author = {Cipra, Barry A.},
	citeulike-article-id = {2175631},
	comment = {- derivation of partition function as a generating function of "Eulerian subgraphs"
- how to count Eulerian subgraphs, up to 8x8 lattice
- Peierls' proof that 2d Ising model has phase transition (uses non-backtracking random walks)
- Kramers/Wannier derivation of critical point for 2d Ising},
	journal = {The American Mathematical Monthly},
	keywords = {combinatorics, ising, physics},
	month = dec,
	number = {10},
	pages = {937--959},
	priority = {2},
	title = {An Introduction to the {Ising} Model},
	url = {http://links.jstor.org/sici?sici=0002-9890\%28198712\%2994\%3A10\%3C937\%3AAITTIM\%3E2.0.CO\%3B2-V},
	volume = {94},
	year = {1987}
}


% Two-page note relating the Ising partition function to graph bipartitions.
@misc{citeulike:2175617,
  author               = {Haggkvist},
  title                = {Graph Theory and Statistical Physics},
  year                 = 1999,
  keywords             = {ising, physics},
  priority             = {2},
  citeulike-article-id = {2175617},
  comment              = {- 2 page note on Ising and graph theory
- weird formulation of partition function -- For a graph G on n vertices and m edges, the Ising
partition function is defined as Sum\_{i,j} a\_ij x^i y^j where aij is the number of bipartitions of the vertices into parts of order (n-j)/2 and (n+j)/2, respectively, with (m-i)/2 edges between them.
- evaluate for x=-beta J, y=-beta H},
}


@article{citeulike:2175581,
	abstract = {The Hamiltonian of the Ising model in one-, two- and three-dimensions has been analysed using unitary transformations and combinatorics. We have been able to obtain closed formulas for the eigenvalues of the Ising Hamiltonian for an arbitrary number of dimensions and sites. Although the solution provided assumes the absence of external magnetic fields an extension to include a magnetic field along the z-axis is readily extracted. Furthermore, generalisations to a higher number of spin components on each site are possible within this method. We made numerical comparisons with the partition function from the earlier analytical expressions known in the literature for one- and two-dimensional cases. We find complete agreement with these studies.},
	author = {Dixon, J. M. and Tuszynski, J. A. and Nip, M. L. A.},
	citeulike-article-id = {2175581},
	comment = {- Eigendecomposition of the path graph},
	doi = {10.1016/S0378-4371(00)00318-6},
	journal = {Physica A: Statistical Mechanics and its Applications},
	keywords = {ising, physics},
	month = jan,
	number = {1--2},
	pages = {137--156},
	priority = {2},
	title = {Exact eigenvalues of the {Ising} {Hamiltonian} in one-, two- and three-dimensions in the absence of a magnetic field},
	url = {http://dx.doi.org/10.1016/S0378-4371(00)00318-6},
	volume = {289},
	year = {2001}
}


@misc{citeulike:2174688,
	abstract = {The effects of an aperiodic order or a random disorder on phase transitions
in statistical mechanics are discussed. A heuristic relevance criterion based
on scaling arguments as well as specific results for Ising models with random
disorder or certain kinds of aperiodic order are reviewed. In particular, this
includes an exact real-space renormalization treatment of the Ising quantum
chains with coupling constants modulated according to substitution sequences,
related to a two-dimensional classical Ising model with layered disorder.},
	archiveprefix = {arXiv},
	author = {Grimm, Uwe},
	citeulike-article-id = {2174688},
	comment = {- transfer matrices for 2d ising
- renormalization group to compute aperiodic ising chain potentials},
	eprint = {cond-mat/0010392v1},
	keywords = {ising, physics},
	month = oct,
	priority = {2},
	title = {Aperiodicity and Disorder --- Does it Matter?},
	url = {http://arxiv.org/abs/cond-mat/0010392v1},
	year = {2000}
}


@book{citeulike:2174620,
	abstract = {{The book deals with selected modern aspects of artificially layered structures and bulk materials involving antiferromagnetic long-range order. Special emphasis is laid on the prototypical behavior of Ising-type model systems. They play a crucial role in the field of statistical physics and, in addition, contribute to the basic understanding of the exchange bias phenomenon in MBE-grown magnetic heterosystems.  Throughout the book, particular attention is given to the interplay between experimental results and their theoretical description, ranging from the famous Lee-Yang theory of phase transitions to novel mechanisms of exchange bias.}},
	author = {Binek, Christian},
	citeulike-article-id = {2174620},
	comment = {- p.11 "The Ising Hamiltonian can be derived from classical anisotropic Heisenberg expression in the limit of infinite positive single ion anisotropy"
- p.9 Lee-Yang theorem, distribution of complex zeros of Partition function},
	howpublished = {Hardcover},
	isbn = {3540404287},
	keywords = {ising, physics},
	month = nov,
	priority = {2},
	publisher = {Springer},
	series = {Springer Tracts in Modern Physics},
	title = {{Ising}-type Antiferromagnets: Model Systems in Statistical Physics and in the Magnetism of Exchange Bias},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/3540404287},
	year = {2003}
}


@book{citeulike:2174595,
	author = {Zvyagin, Andrei A.},
	citeulike-article-id = {2174595},
	comment = {- Solutions of Ising model in 1d and 2d},
	howpublished = {Hardcover},
	isbn = {1860945031},
	keywords = {book, ising, physics},
	month = jun,
	priority = {2},
	publisher = {{Imperial College Press}},
	title = {Finite Size Effects in Correlated Electron Models: Exact Results},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/1860945031},
	year = {2005}
}


% NOTE(review): year/month come from the Amazon listing; Masson's original
% edition is believed to date from 1983 -- confirm which printing is cited.
@book{citeulike:2174451,
	author = {Gaudin, Michel},
	citeulike-article-id = {2174451},
	comment = {- diagonalizing Ising Hamiltonian},
	howpublished = {{Unknown Binding}},
	isbn = {2225796076},
	keywords = {book, ising, physics},
	month = jan,
	priority = {2},
	publisher = {Masson},
	series = {Collection du Commissariat {\`a} l'{\'e}nergie atomique},
	title = {La fonction d'onde de {Bethe}},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/2225796076},
	year = {2007}
}


@book{citeulike:1280736,
	author = {Dirac, P. A. M.},
	citeulike-article-id = {1280736},
	comment = {- introduces bra-ket notation},
	howpublished = {Paperback},
	isbn = {0198520115},
	keywords = {book, physics},
	month = feb,
	priority = {2},
	publisher = {{Oxford University Press, USA}},
	series = {International Series of Monographs on Physics},
	title = {The Principles of Quantum Mechanics},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0198520115},
	year = {1982}
}


@book{citeulike:2174431,
	abstract = {{The \emph{Encyclopedic Dictionary of Mathematics}, as put out by the Mathematical Society of Japan, is as complete and comprehensive an opus as one could wish for, concisely comprising in its two volumes all significant mathematical results, both pure and applied, elementary to advanced. This second edition is, basically, an English version of the acclaimed Japanese third edition. The \emph{EDM2}, as it is known, succinctly but thoroughly covers math from A to Z, from Niels Henrik Abel and Abelian groups to Witt vectors and Zeta functions. Within its 2,000-plus pages are elegant explanations of diffusion processes, Fourier series, linear operators, and meromorphic functions. There are pages dedicated to quadratic fields and robust and nonparametric methods, and following each section, all the relevant references are listed. In addition, there are appendices with tables of formulas, numerical tables, and statistical tables, journals, publishers, and special notations, articles listed both systematically and alphabetically, plus a name index and an exhaustive subject index that's 231 pages long. It is a quality product--easily accessible, adhering to rigorous standards, and worth the investment for any school or personal math library. --Stephanie Gold} {When the first edition of the \emph{Encyclopedic Dictionary of Mathematics} appeared in 1977, it was immediately hailed as a landmark contribution to mathematics: "The standard reference for anyone who wants to get acquainted with any part of the mathematics of our time" (Jean Dieudonn{\'e}, \emph{American Mathematical Monthly}). 
"A magnificent reference work that belongs in every college and university library" (\emph{Choice}), "This unique and masterfully written encyclopedia is more than just a reference work: it is a carefully conceived course of study in graduate-level mathematics" (\emph{Library Journal}). The new edition of the encyclopedia has been revised to bring it up to date and expanded to include more subjects in applied mathematics. There are 450 articles as compared to 436 in the first edition: 70 new articles have been added, whereas 56 have been incorporated into other articles and out-of-date material has been dropped. All the articles have been newly edited and revised to take account of recent work, and the extensive appendixes have been expanded to make them even more useful. The cross-referencing and indexing and the consistent set-theoretical orientation that characterized the first edition remain unchanged. The encyclopedia includes articles in the following areas: Logic and Foundations; Sets, General Topology, and Categories; Algebra; Group Theory; Number Theory; Euclidean and Projective Geometry; Differential Geometry; Algebraic Geometry; Topology; Analysis; Complex Analysis; Functional Analysis; Differential, Integral, and Functional Equations; Special Functions; Numerical Analysis; Computer Science and Combinatorics; Probability Theory; Statistics; Mathematical Programming and Operations Research; Mechanics and Theoretical Physics; History of Mathematics. Kiyosi Ito is professor emeritus of mathematics at Kyoto University.}},
	author = {Ito, Kiyoshi},
	citeulike-article-id = {2174431},
	comment = {- p.1257 probabilistic methods in statistical mechanics
- contact process -- exists critical transmission rate after which infinite people get infected
- critical percolation probability for square, triangular, honeycomb lattices
- p.1302 Quantum Mechanics },
	edition = {Second},
	howpublished = {Hardcover},
	isbn = {0262090260},
	keywords = {ising, mixing, physics},
	month = jun,
	priority = {2},
	publisher = {{The MIT Press}},
	title = {Encyclopedic Dictionary of Mathematics},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0262090260},
	year = {1987}
}


@book{citeulike:2174385,
	abstract = {{Aims to describe in simple terms the new area of statistical mechanics  known as spin-glasses, encompassing systems in which quenched disorder is  the dominant factor.}},
	author = {Dotsenko, Viktor},
	citeulike-article-id = {2174385},
	comment = {- introduction to mean-field method, replica method},
	howpublished = {Hardcover},
	isbn = {9810218737},
	keywords = {book, ising, physics},
	month = mar,
	priority = {2},
	publisher = {{World Scientific Publishing Company}},
	series = {World Scientific Lecture Notes in Physics},
	title = {An Introduction to the Theory of Spin Glasses and Neural Networks},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/9810218737},
	year = {1995}
}


@book{citeulike:2174356,
	author = {Feynman, Richard P.},
	citeulike-article-id = {2174356},
	comment = {- Sec.7.3 operator description of 1d Ising},
	howpublished = {Paperback},
	isbn = {0201360764},
	keywords = {ising, physics},
	month = mar,
	priority = {2},
	publisher = {{Perseus Books Group}},
	series = {Advanced Book Classics},
	title = {Statistical Mechanics: A Set of Lectures},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0201360764},
	year = {1998}
}


@book{citeulike:774034,
	abstract = {{User's guide and reference manual for the software LATEX. Updated to  reflect the changes in the new release. Includes a section on how to send  your LATEX documents electronically. Covers LATEX 2. Paper. DLC: LATEX  (Computer file)  }},
	author = {Lamport, Leslie},
	citeulike-article-id = {774034},
	edition = {Second},
	howpublished = {Paperback},
	isbn = {0201529831},
	keywords = {book},
	month = jun,
	priority = {2},
	publisher = {{Addison-Wesley Professional}},
	title = {{\LaTeX}: A Document Preparation System},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0201529831},
	year = {1994}
}


@book{citeulike:218038,
	author = {Hughes, Barry D.},
	citeulike-article-id = {218038},
	comment = {- Ch.7 is on Self-Avoiding Walk
- sec.3.5 derives Green's function for walks with internal states
- p.423 proof of existence of connective growth based on SAW subadditivity
- p.428 best bounds/estimates of connective constant
- p.430 simultaneous difference equations to find SAW number with finite memory
- p.445 asymptotic formula for number of SAW's between two points (Hammersley-Morton theorem)
- sec.7.6.1 gives equivalence between n-vector model and SAW's
- p.528 expresses susceptibility as a sum of two-point correlation functions (fluctuation-dissipation theorem)},
	howpublished = {Hardcover},
	isbn = {0198537883},
	keywords = {book, saw},
	month = mar,
	priority = {2},
	publisher = {{Clarendon Press}},
	title = {Random Walks and Random Environments, Volume 1: Random Walks},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0198537883},
	year = {1995}
}


@book{citeulike:1345898,
	abstract = {{This text provides a thoroughly modern graduate-level introduction to the theory of critical behavior. Beginning with a brief review of phase transitions in simple systems and of mean field theory, the text then goes on to introduce the core ideas of the renormalization group.  Following chapters cover phase diagrams, fixed points, cross-over behavior, finite-size scaling, perturbative renormalization methods, low-dimensional systems, surface critical behavior, random systems, percolation, polymer statistics, critical dynamics and conformal symmetry.  The book closes with an appendix on Gaussian integration, a selected bibliography, and a detailed index.  Many problems are included.  The emphasis throughout is on providing an elementary and intuitive approach.  In particular, the perturbative method introduced leads, among applications, to a simple derivation of the epsilon expansion in which all the actual calculations (at least to lowest order) reduce to simple counting, avoiding the need for Feynman diagrams.}},
	author = {Cardy, John},
	citeulike-article-id = {1345898},
	comment = {- sec.2.1 intro to Mean Field for 1d Ising
- mean field correlation function (exponential with power law correction)
- exact solution of a renormalization group example for 1D Ising
- p.172 Eulerian subgraph representation for Z of O(n) model (self-avoiding loops, generalized loops), eq. 9.7
- representation of correlation between two points in O(n) model as sum over all self-avoiding walks},
	howpublished = {Paperback},
	isbn = {0521499593},
	keywords = {book, ising, physics},
	month = apr,
	priority = {2},
	publisher = {{Cambridge University Press}},
	series = {Cambridge Lecture Notes in Physics},
	title = {Scaling and Renormalization in Statistical Physics},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0521499593},
	year = {1996}
}


% NOTE(review): year = 1982 conflicts with the August 1989 preface date and the
% "electronic edition" mentioned in the abstract -- confirm which edition is cited.
% The abstract is an auto-extracted preface fragment (truncated at both ends).
@book{citeulike:2168408,
	abstract = {ful for her permission to include
this material.

Again S{\o}ren Asmussen and S{\o}ren Tolver Jensen have found errors and
made many detailed suggestions for improvements in the presentation. Helle
Andersen and Helle Westmark have helped transforming the typescript into
a typeset document. S{\o}ren L. Buhl has given me very helpful comments on
this manuscript. Thank you.

Aalborg, August 1989

Steffen L. Lauritzen

Preface to the electronic edition

The only essential changes from the third edition to this...},
	address = {Aalborg, Denmark},
	author = {Lauritzen, Steffen L.},
	citeulike-article-id = {2168408},
	comment = {(private-note)- Decomposable/hierarchical models, proof of Hammersley-Clifford using Moebius inversion},
	keywords = {graphical},
	priority = {2},
	publisher = {University of Aalborg Press},
	title = {Lectures on Contingency Tables},
	url = {http://citeseer.ist.psu.edu/lauritzen02lectures.html},
	year = {1982}
}


% NOTE(review): doi and month were added from the publisher record for
% ACM Computing Surveys 23(1); they are not derivable from this file -- confirm.
@article{citeulike:438874,
	abstract = {This paper presents a tutorial on those aspects of floating-point that have a direct impact on designers of computer systems. It begins with background on floating-point representation and rounding error, continues with a discussion of the IEEE floating-point standard, and concludes with numerous examples of how computer builders can better support floating-point.},
	author = {Goldberg, David},
	citeulike-article-id = {438874},
	comment = {- IEEE 754 explanation
- benign vs. catastrophic cancellation, guard digit},
	doi = {10.1145/103162.103163},
	journal = {ACM Computing Surveys},
	keywords = {numerical-analysis},
	month = mar,
	number = {1},
	pages = {5--48},
	priority = {2},
	title = {What Every Computer Scientist Should Know About Floating-Point Arithmetic},
	url = {http://citeseer.ist.psu.edu/goldberg91what.html},
	volume = {23},
	year = {1991}
}


@book{citeulike:2168009,
	abstract = {{Including many algorithms described in simple terms, this book stresses common techniques (such as generating functions and recursive construction) that underlie the great variety of subject matter.}},
	author = {Cameron, Peter J.},
	citeulike-article-id = {2168009},
	comment = {- Ch. 12 Moebius inversion for boolean algebra, Arrow's theorem},
	howpublished = {Paperback},
	isbn = {0521457610},
	keywords = {book, combinatorics},
	month = jan,
	priority = {2},
	publisher = {{Cambridge University Press}},
	title = {Combinatorics: Topics, Techniques, Algorithms},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0521457610},
	year = {1995}
}


@book{citeulike:2164569,
	abstract = {{The essential guide to MATLAB as a problem solving tool. This text presents MATLAB both as a mathematical tool and a programming language, giving a concise and easy to master introduction to its potential and power. Stressing the importance of a structured approach to problem solving, the text gives a step-by-step method for program design and algorithm development. The fundamentals of MATLAB are illustrated throughout with many examples from a wide range of familiar scientific and engineering areas, as well as from everyday life. Features: Numerous simple exercises provide hands-on learning of MATLAB's functions; A new chapter on dynamical systems shows how a structured approach is used to solve more complex problems; Common errors and pitfalls highlighted; Concise introduction to useful topics for solving problems in later engineering and science courses: vectors as arrays, arrays of characters, GUIs, advanced graphics, simulation and numerical methods; Text and graphics in four colour; Extensive instructor support. \emph{Essential MATLAB for Engineers and Scientists} is an ideal textbook for a first course on MATLAB or an engineering problem solving course using MATLAB, as well as a self-learning tutorial for students and professionals expected to learn and apply MATLAB for themselves. Additional material is available for lecturers only at http://textbooks.elsevier.com. 
This website provides lecturers with: A series of Powerpoint presentations to assist lecture preparation; Extra quiz questions and problems; Additional topic material; M-files for the exercises and examples in the text (also available to students at the book's companion site); Solutions to exercises; An interview with the revising author, Daniel Valentine. Numerous simple exercises give hands-on learning; A chapter on algorithm development and program design; Common errors and pitfalls highlighted; Concise introduction to useful topics for solving problems in later engineering and science courses: vectors as arrays, arrays of characters, GUIs, advanced graphics, simulation and numerical methods; A new chapter on dynamical systems shows how a structured approach is used to solve more complex problems; Text and graphics in four colour; Extensive teacher support on http://textbooks.elsevier.com: solutions manual, extra problems, multiple choice questions, PowerPoint slides; Companion website for students providing M-files used within the book}},
	author = {Hahn, Brian and Valentine, Dan},
	citeulike-article-id = {2164569},
	comment = {- Sec.7 list of visualization approaches},
	edition = {Third},
	howpublished = {Paperback},
	isbn = {0750684178},
	keywords = {book},
	month = mar,
	priority = {2},
	publisher = {Newnes},
	title = {Essential {MATLAB} for Engineers and Scientists},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0750684178},
	year = {2007}
}


% NOTE(review): year inferred from the "(94)" component of the DOI -- confirm.
% The math in the abstract was reconstructed from a plain-text rendering.
@article{citeulike:2164570,
	abstract = {The entropy contraction coefficient associated with the convex function $g$ and column-stochastic matrix $A$ is defined as $\eta_g(A)=\sup_{x\neq y} H_g(Ax,Ay)/H_g(x,y)$, where $H_g(x,y)$ denotes the relative $g$-entropy of the probability vectors $x$ and $y$. We show that, for each fixed $A$, all entropy contraction coefficients associated with convex operator functions are equivalent. In particular, $\eta_{\log}(A)=\eta_{w^2}(A)$ for all $A$, where the subscripts $\log$ and $w^2$ refer, respectively, to the usual logarithmic relative entropy and to the relative $g$-entropy for the function $g(w)=w^2$. We also give a useful new characterization of $\eta_{w^2}(A)$.},
	author = {Choi, Man-Duen and Ruskai, Mary B. and Seneta, Eugene},
	citeulike-article-id = {2164570},
	comment = {- Contraction of all g-entropies is the same (ie, Shannon entropy vs. sum of squares)},
	doi = {10.1016/0024-3795(94)90428-6},
	journal = {Linear Algebra and its Applications},
	keywords = {ergodicity},
	pages = {29--36},
	priority = {2},
	title = {Equivalence of certain entropy contraction coefficients},
	url = {http://dx.doi.org/10.1016/0024-3795(94)90428-6},
	volume = {208--209},
	year = {1994}
}


% Pollinator flower-constancy paper; kept for its constancy index (see comment field).
@article{citeulike:502089,
  author               = {Waser, Nickolas M.},
  title                = {Flower Constancy: Definition, Cause, and Measurement},
  journal              = {The American Naturalist},
  volume               = 127,
  number               = 5,
  pages                = {593--603},
  year                 = 1986,
  url                  = {http://links.jstor.org/sici?sici=0003-0147\%28198605\%29127\%3A5\%3C593\%3AFCDCAM\%3E2.0.CO\%3B2-J},
  abstract             = {A pollinator that restricts its visits to one flower type, even when other rewarding types are accessible, can be said to exhibit flower constancy. This usage distinguishes constancy from fixed preference or labile preference for the most rewarding flower type; I discuss a quantitative constancy index that is insensitive to preference changes. Because a constant visitor avoids flowers with acceptable rewards, the behavior is inefficient unless there are constraints such as an inability to learn quickly or to remember simultaneously how to deal with many flower types. If such constraints are the basis for constancy, it should be most pronounced when flowers in a mixture differ strongly in morphology or color. I observed bees foraging in outdoor flower arrays and found that constancy always increased with increasing differences among flower types; similar results can be gleaned from one other study. The available experimental evidence thus suggests that constancy reflects behavioral constraints.},
  keywords             = {ergodicity},
  priority             = {2},
  citeulike-article-id = {502089},
  comment              = {Bateman's index (same as Birkhoff's coefficient of ergodicity)},
}


% NOTE(review): given names, author order, volume, number and pages were filled
% in from the published record (Pacific J. Math. 21(2), 1967) -- confirm.
@article{citeulike:2162627,
	author = {Sinkhorn, Richard and Knopp, Paul},
	citeulike-article-id = {2162627},
	comment = {- IPF converges to doubly-stochastic matrix iff A has 1+ positive diagonals
- mini-summary at http://groups.google.com/group/sci.math/browse\_thread/thread/3c4919dd66e1175b\#},
	journal = {Pacific Journal of Mathematics},
	keywords = {statistics},
	number = {2},
	pages = {343--348},
	priority = {2},
	title = {Concerning nonnegative matrices and doubly stochastic matrices},
	url = {http://projecteuclid.org/DPubS?service=UI\&version=1.0\&verb=Display\&handle=euclid.pjm/1102992505},
	volume = {21},
	year = {1967}
}


@article{citeulike:2154280,
	address = {Oxford, UK},
	author = {Bilmes, Jeff A.},
	citeulike-article-id = {2154280},
	comment = {- 4.7 State duration for HMM is negative binomial, geometric for Markov chain
- Conditions on distribution for HMM to be accurate},
	doi = {10.1093/ietisy/e89-d.3.869},
	issn = {0916-8532},
	journal = {IEICE Transactions on Information and Systems},
	keywords = {hmm},
	number = {3},
	pages = {869--891},
	priority = {2},
	publisher = {Oxford University Press},
	title = {What {HMMs} Can Do},
	url = {http://portal.acm.org/citation.cfm?id=1184964},
	volume = {E89-D},
	year = {2006}
}


% NOTE(review): booktitle (a required field here) was missing; the venue below is
% believed to be IJCAI 2007 but is not derivable from this file -- confirm.
@inproceedings{citeulike:2157295,
	abstract = {In this paper we prove that the well-known correspondence between the forward-backward algorithm for hidden Markov models (HMMs) and belief propagation (BP) applied to HMMs can be generalized to one between BP for junction trees and the generalized inside-outside probability computation for probabilistic logic programs applied to junction trees.},
	author = {Sato, Taisuke},
	booktitle = {Proceedings of the 20th International Joint Conference on Artificial Intelligence ({IJCAI}-07)},
	citeulike-article-id = {2157295},
	comment = {- inside-outside = sum product},
	keywords = {graphical, message-passing},
	priority = {2},
	title = {Inside-Outside Probability Computation for Belief Propagation},
	year = {2007}
}


% The export split the single author "Gerhard Osius" into two surnames joined by "and".
@article{citeulike:652410,
	author = {Osius, Gerhard},
	citeulike-article-id = {652410},
	comment = {Proves that cross-product ratios and marginals are sufficient to reconstruct joint},
	doi = {10.1007/s001840300309},
	issn = {0026-1335},
	journal = {Metrika},
	keywords = {statistics},
	month = nov,
	number = {3},
	pages = {261--277},
	priority = {2},
	publisher = {Springer},
	title = {The association between two random elements: A complete characterization and odds ratio models},
	url = {http://dx.doi.org/10.1007/s001840300309},
	volume = {60},
	year = {2004}
}


% Sinkhorn's 1964 result for strictly positive matrices (see comment field).
@article{citeulike:2153574,
  author               = {Sinkhorn, Richard},
  title                = {A Relationship Between Arbitrary Positive Matrices and Doubly Stochastic Matrices},
  journal              = {The Annals of Mathematical Statistics},
  volume               = 35,
  number               = 2,
  pages                = {876--879},
  year                 = 1964,
  url                  = {http://links.jstor.org/sici?sici=0003-4851\%28196406\%2935\%3A2\%3C876\%3AARBAPM\%3E2.0.CO\%3B2-G},
  keywords             = {statistics},
  priority             = {2},
  citeulike-article-id = {2153574},
  comment              = {- proves IPF converges to doubly stochastic matrix for positive matrices
- prequel to 1967 Sinkhorn paper},
}


@incollection{NIPS2006_257,
	address = {Cambridge, MA},
	author = {Zass, Ron and Shashua, Amnon},
	booktitle = {Advances in Neural Information Processing Systems 19},
	citeulike-article-id = {2153567},
	comment = {- successive projections to find closest doubly-stochastic matrix
- reference to paper proving IPF converges to doubly stochastic},
	editor = {Sch{\"o}lkopf, B. and Platt, J. and Hoffman, T.},
	keywords = {bibtex-import},
	pages = {1569--1576},
	priority = {2},
	publisher = {MIT Press},
	title = {Doubly Stochastic Normalization for Spectral Clustering},
	year = {2007}
}


@book{citeulike:2153548,
	author = {Roberts, Arthur W. and Varberg, Dale},
	citeulike-article-id = {2153548},
	comment = {- p.200 proof that doubly stochastic matrices are convex hull of permutation matrices},
	howpublished = {Hardcover},
	isbn = {0125897405},
	keywords = {linear-algebra},
	month = dec,
	priority = {2},
	publisher = {{Academic Press Inc.,U.S.}},
	series = {Pure \& Applied Mathematics},
	title = {Convex Functions},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0125897405},
	year = {1973}
}


% year, volume, number and first page are decoded from the SICI in the url field
% (0035-9246, 1970, 32:1, p. 63).
% NOTE(review): journal name (from the ISSN) and the end page are not derivable
% from this file -- confirm.
@article{citeulike:2153411,
	abstract = {The generalization of Edwards's argument for the measure of association of the rows and columns of $2 \times 2$ table, to that of an $r \times s$ table whose rows and columns are assumed unordered, shows, not surprisingly, that association ought to be measured by some function of the $(r-1)(s-1)$ cross-ratios. Such a function is suggested by the introduction of a metric on certain equivalence classes. The properties of such metrics are examined, and in particular comparisons are made with Good's suggestion of the use of the algebraic rank of the contingency table, and with Lindley's significance test for association in the $r \times s$ table.},
	author = {Altham, Patricia M. E.},
	citeulike-article-id = {2153411},
	comment = {- page 68, equation IX, Birkhoff contraction coefficient (Yule's measure of colligation)
- Birkhoff contraction coefficient as distance from rank-1 matrix on the space where matrix equivalence is defined in terms of cross-product ratios},
	journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
	keywords = {statistics},
	number = {1},
	pages = {63--73},
	priority = {2},
	title = {The Measurement of Association of Rows and Columns for an {$r \times s$} Contingency Table},
	url = {http://links.jstor.org/sici?sici=0035-9246\%281970\%2932\%3A1\%3C63\%3ATMOAOR\%3E2.0.CO\%3B2-Q},
	volume = {32},
	year = {1970}
}


@book{citeulike:2153366,
	abstract = {{Clearly reviews the properties of important contemporary measures of association and correlation.  Devotes full chapters to measures for nominal, ordinal, and continuous data, paying special attention to the sampling distributions needed to determine levels of significance and confidence intervals.}},
	author = {Liebetrau, Albert M.},
	citeulike-article-id = {2153366},
	comment = {- Yule's measure of colligation for contingency tables},
	howpublished = {Paperback},
	isbn = {0803919743},
	keywords = {book, statistics},
	month = apr,
	priority = {2},
	publisher = {{Sage Publications, Inc}},
	series = {Quantitative Applications in the Social Sciences},
	title = {Measures of Association},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0803919743},
	year = {1983}
}


@book{citeulike:2153334,
	abstract = {{This book provides an up-to-date account of the theory and applications of linear models. It can be used as a text for courses in statistics at the graduate level as well as an accompanying text for other courses in which linear models play a part. The authors present a unified theory of inference from linear models with minimal assumptions, not only through least squares theory, but also using alternative methods of estimation and testing based on convex loss functions and general estimating equations. Some of the highlights include: - a special emphasis on sensitivity analysis and model selection; - a chapter devoted to the analysis of categorical data based on logit, loglinear, and logistic regression models; - a chapter devoted to incomplete data sets; - an extensive appendix on matrix theory, useful to researchers in econometrics, engineering, and optimization theory; - a chapter devoted to the analysis of categorical data based on a unified presentation of generalized linear models including GEE- methods for correlated response; - a chapter devoted to incomplete data sets including regression diagnostics to identify Non-MCAR-processes The material covered will be invaluable not only to graduate students, but also to research workers and consultants in statistics. Helge Toutenburg is Professor for Statistics at the University of Muenchen. He has written about 15 books on linear models, statistical methods in quality engineering, and the analysis of designed experiments. His main interest is in the application of statistics to the fields of medicine and engineering.}},
	author = {Rao, C. R. and Toutenburg, Helge},
	citeulike-article-id = {2153334},
	comment = {- odds-ratio for NxM tables (set of all possible 2x2 odds ratios)},
	howpublished = {Hardcover},
	isbn = {0387988483},
	keywords = {book, statistics},
	month = jul,
	priority = {2},
	publisher = {Springer},
	series = {Springer Series in Statistics},
	title = {Linear Models: Least Squares and Alternatives},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0387988483},
	year = {1999}
}


% NOTE(review): the url field is a relative path; the site it was relative to
% is not recorded in this entry, so the link cannot be resolved as written.
@incollection{citeulike:2051282,
	author = {Sutton, Charles and McCallum, Andrew},
	booktitle = {Introduction to Statistical Relational Learning},
	citeulike-article-id = {2051282},
	editor = {Getoor, Lise and Taskar, Ben},
	keywords = {crf},
	priority = {2},
	publisher = {MIT Press},
	title = {An Introduction to Conditional Random Fields for Relational Learning},
	url = {publications/crf-tutorial.pdf},
	year = {2006}
}


@book{citeulike:163662,
	abstract = {{Convex optimization problems arise frequently in many different fields. A comprehensive introduction to the subject, this book shows in detail how such problems can be solved numerically with great efficiency. The focus is on recognizing convex optimization problems and then finding the most appropriate technique for solving them. The text contains many worked examples and homework exercises and will appeal to students, researchers and practitioners in fields such as engineering, computer science, mathematics, statistics, finance, and economics.}},
	author = {Boyd, Stephen and Vandenberghe, Lieven},
	citeulike-article-id = {163662},
	howpublished = {Hardcover},
	isbn = {0521833787},
	keywords = {book, optimization},
	month = mar,
	priority = {2},
	publisher = {{Cambridge University Press}},
	title = {Convex Optimization},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0521833787},
	year = {2004}
}


@book{citeulike:308856,
	abstract = {{This book provides a solid statistical foundation for neural networks from a pattern recognition  perspective. The focus is on the types of neural nets that are most widely used in practical applications,  such as the multi-layer perceptron and radial basis function networks. Rather than trying to cover many  different types of neural networks, Bishop thoroughly covers topics such as density estimation, error  functions, parameter optimization algorithms, data pre-processing, and Bayesian methods. All topics are  organized well and all mathematical foundations are explained before being applied to neural networks.  The text is suitable for a graduate or advanced undergraduate level course on neural networks or for  practitioners interested in applying neural networks to real-world problems. The reader is assumed to have  the level of math knowledge necessary for an undergraduate science degree.} {This is the first comprehensive treatment of feed-forward neural networks from the perspective of statistical pattern recognition. After introducing the basic concepts, the book examines techniques for modelling probability density functions and the properties and merits of the multi-layer perceptron and radial basis function network models. Also covered are various forms of error functions, principal algorithms for error function minimalization, learning and generalization in neural networks, and Bayesian techniques and their applications.  Designed as a text, with over 100 exercises, this fully up-to-date work will benefit anyone involved in the fields of neural computation and pattern recognition.}},
	author = {Bishop, Christopher M.},
	citeulike-article-id = {308856},
	comment = {- curse of dimensionality calculations},
	howpublished = {Paperback},
	isbn = {0198538642},
	keywords = {book, machine-learning},
	month = nov,
	priority = {2},
	publisher = {{Oxford University Press}},
	title = {Neural Networks for Pattern Recognition},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0198538642},
	year = {1995}
}


@book{citeulike:148945,
	abstract = {{This book covers the field of machine learning, which is the study of algorithms that allow computer programs to automatically improve through experience. The book is intended to support upper level undergraduate and introductory level graduate courses in machine learning.}},
	author = {Mitchell, Tom M.},
	citeulike-article-id = {148945},
	comment = {- gentle introduction},
	howpublished = {Hardcover},
	isbn = {0070428077},
	keywords = {book},
	month = mar,
	priority = {2},
	publisher = {{McGraw-Hill Science/Engineering/Math}},
	title = {Machine Learning},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0070428077},
	year = {1997}
}


% NOTE(review): the book entry type requires a year field, which is missing
% here and is not derivable from the recorded fields - look it up via the ISBN
% before citing this entry.
@book{citeulike:2139497,
	author = {Fukunaga, Keinosuke},
	citeulike-article-id = {2139497},
	comment = {- gives calculations of expected error for high dimensional Gaussians (Also in Bishop's book)},
	howpublished = {Hardcover},
	isbn = {0122698509},
	keywords = {book},
	priority = {2},
	publisher = {Academic Press},
	title = {Introduction to Statistical Pattern Recognition},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0122698509}
}


@book{citeulike:308696,
	abstract = {{Following a brief introduction and overview, early chapters cover the basic algebraic relationships of entropy, relative entropy and mutual information, AEP, entropy rates of stochastic processes and data compression, duality of data compression and the growth rate of wealth. Later chapters explore Kolmogorov complexity, channel capacity, differential entropy, the capacity of the fundamental Gaussian channel, the relationship between information theory and statistics, rate distortion and network information theories. The final two chapters examine the stock market and inequalities in information theory. In many cases the authors actually describe the properties of the solutions before the presented problems.}},
	author = {Cover, Thomas M. and Thomas, Joy A.},
	citeulike-article-id = {308696},
	howpublished = {Hardcover},
	isbn = {0471062596},
	keywords = {book},
	month = aug,
	priority = {2},
	publisher = {Wiley-Interscience},
	title = {Elements of Information Theory},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0471062596},
	year = {1991}
}


@book{citeulike:161814,
	abstract = {{During the past decade there has been an explosion in computation and information technology. With it has come vast amounts of data in a variety of fields such as medicine, biology, finance, and marketing. The challenge of understanding these data has led to the development of new tools in the field of statistics, and spawned new areas such as data mining, machine learning, and bioinformatics.   Many of these tools have common underpinnings but are often expressed with different terminology. This book describes the important ideas in these areas in a common conceptual framework. While the approach is statistical, the emphasis is on concepts rather than mathematics. Many examples are given, with a liberal use of color graphics. It should be a valuable resource for statisticians and anyone interested in data mining in science or industry.   The book's coverage is broad, from supervised learning (prediction) to unsupervised learning. The many topics include neural networks, support vector machines, classification trees and boosting--the first comprehensive treatment of this topic in any book.   Trevor Hastie, Robert Tibshirani, and Jerome Friedman are professors of statistics at Stanford University. They are prominent researchers in this area: Hastie and Tibshirani developed generalized additive models and wrote a popular book of that title. Hastie wrote much of the statistical modeling software in S-PLUS and invented principal curves and surfaces. Tibshirani proposed the Lasso and is co-author of the very successful An Introduction to the Bootstrap. Friedman is the co-inventor of many data-mining tools including CART, MARS, and projection pursuit.}},
	author = {Hastie, T. and Tibshirani, R. and Friedman, J. H.},
	citeulike-article-id = {161814},
	howpublished = {Hardcover},
	isbn = {0387952845},
	keywords = {book},
	month = aug,
	priority = {2},
	publisher = {Springer},
	title = {The Elements of Statistical Learning},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0387952845},
	year = {2001}
}


% The abstract is a truncated text extraction of the first page of the paper;
% the matrices were rebuilt from the spilled rows (the stray "0 B B @" and
% "1 C C A" runs were the matrix delimiters, and the minus signs had been lost).
@misc{citeulike:2139481,
	abstract = {Introduction

An alternating-sign matrix of order $n$ is an $n$-by-$n$ array of 0's, $+1$'s and $-1$'s with the property that in
each row and each column, the non-zero entries alternate in sign, beginning and ending with a $+1$. For
example, Figure 1 shows an alternating-sign matrix (ASM for short) of order 4.
\[
\left(\begin{array}{rrrr}
0 & +1 & 0 & 0 \\
+1 & -1 & +1 & 0 \\
0 & 0 & 0 & +1 \\
0 & +1 & 0 & 0
\end{array}\right)
\]
Figure 1: An alternating-sign matrix of order 4.
Figure 2 exhibits all seven of the ASMs of order 3.
\[
\left(\begin{array}{rrr}
0 & 0 & +1 \\
0 & +1 & 0 \\
+1 & 0 & 0
\end{array}\right)
\]
...},
	author = {Propp, J.},
	citeulike-article-id = {2139481},
	comment = {- Equivalence with square ice, gasket/basket, loop packing models},
	keywords = {combinatorics},
	priority = {2},
	title = {The many faces of alternating-sign matrices},
	url = {http://citeseer.ist.psu.edu/447933.html}
}


@book{citeulike:969205,
	author = {Royden, Halsey},
	citeulike-article-id = {969205},
	edition = {Third},
	howpublished = {Hardcover},
	isbn = {0024041513},
	keywords = {book},
	month = feb,
	priority = {2},
	publisher = {{Prentice Hall}},
	title = {Real Analysis},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0024041513},
	year = {1988}
}


@book{citeulike:1289400,
	abstract = {{In each generation, scientists must redefine their fields: abstracting, simplifying and distilling the previous standard topics to make room for new advances and methods. Sethna's book takes this step for statistical mechanics - a field rooted in physics and chemistry whose ideas and methods are now central to information theory, complexity, and modern biology. Aimed at advanced undergraduates and early graduate students in all of these fields, Sethna limits his main presentation to the topics that future mathematicians and biologists, as well as physicists and chemists, will find fascinating and central to their work. The amazing breadth of the field is reflected in the author's large supply of carefully crafted exercises, each an introduction to a whole field of study: everything from chaos through information theory to life at the end of the universe.}},
	author = {Sethna, James P.},
	citeulike-article-id = {1289400},
	comment = {- green's functions, self-avoiding walks, mentions Ising connection
- intro to correlation functions
- appendix on Fourier methods},
	howpublished = {Paperback},
	isbn = {0198566778},
	keywords = {book, fun, ising, physics, saw},
	month = may,
	priority = {2},
	publisher = {{Oxford University Press, USA}},
	series = {Oxford Master Series in Physics},
	title = {Statistical Mechanics: Entropy, Order Parameters and Complexity},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0198566778},
	year = {2006}
}


% Diaconis and Sturmfels: Markov chain sampling conditional on a sufficient statistic.
@misc{citeulike:2137748,
  author               = {Diaconis, P.  and Sturmfels, B. },
  title                = {Algebraic algorithms for sampling from conditional distributions},
  year                 = {1995},
  url                  = {http://citeseer.ist.psu.edu/442657.html},
  abstract             = {We construct Markov chain algorithms for sampling from discrete exponential families
conditional on a sufficient statistic. Examples include generating tables with fixed row and
column sums and higher dimensional analogs. The algorithms involve finding bases for
associated polynomial ideals and so an excursion into computational algebraic geometry.},
  keywords             = {statistics},
  comment              = {- MCMC for sampling doubly stochastic matrices (icml/scratch.nb)},
  priority             = {2},
  citeulike-article-id = {2137748},
}


% Typed as misc because no journal or year is recorded, both of which the
% article type requires.
% NOTE(review): the author's given name, the year and the venue are all
% missing - complete them from the original source if it can be located.
@misc{citeulike:2120482,
	author = {Kupiainen},
	citeulike-article-id = {2120482},
	comment = {shows finite correlation length for high temperature expansion, P runs over self-avoiding paths},
	keywords = {book, ising, physics},
	priority = {2},
	title = {Introduction to the renormalization group}
}


% Second import of the paper also filed as citeulike:474507. The key follows
% this record's own citeulike-article-id so the two keys no longer collide.
@article{citeulike:2120133,
	abstract = {Algorithms that must deal with complicated global functions of many variables often exploit the manner in which the given functions factor as a product of ``local'' functions, each of which depends on a subset of the variables. Such a factorization can be visualized with a bipartite graph that we call a factor graph. In this tutorial paper, we present a generic message-passing algorithm, the sum-product algorithm, that operates in a factor graph. Following a single, simple computational rule, the sum-product algorithm computes---either exactly or approximately---various marginal functions derived from the global function. A wide variety of algorithms developed in artificial intelligence, signal processing, and digital communications can be derived as specific instances of the sum-product algorithm, including the forward/backward algorithm, the Viterbi algorithm, the iterative ``turbo'' decoding algorithm, Pearl's (1988) belief propagation algorithm for Bayesian networks, the Kalman filter, and certain fast Fourier transform (FFT) algorithms},
	author = {Kschischang, F. R. and Frey, B. J. and Loeliger, H. A.},
	citeulike-article-id = {2120133},
	journal = {IEEE Transactions on Information Theory},
	keywords = {bibtex-import, message-passing},
	number = {2},
	pages = {498--519},
	priority = {2},
	title = {Factor graphs and the sum-product algorithm},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=910572},
	volume = {47},
	year = {2001}
}


% CiteSeer import of the paper also filed as citeulike:474507; the abstract
% is the truncated CiteSeer excerpt.
@article{citeulike:474266,
	abstract = {Algorithms that must deal with complicated global
functions of many variables often exploit the manner in which the
given functions factor as a product of ``local'' functions, each of
which depends on a subset of the variables. Such a factorization
can be visualized with a bipartite graph that we call a factor graph.
In this tutorial paper, we present a generic message-passing algorithm,
the sum-product algorithm, that operates in a factor graph.
Following a single, simple computational rule, the ...},
	author = {Kschischang, F. R. and Frey, B. J. and Loeliger, H. A.},
	citeulike-article-id = {474266},
	journal = {IEEE Transactions on Information Theory},
	keywords = {message-passing},
	number = {2},
	pages = {498--519},
	priority = {2},
	title = {Factor Graphs and the Sum-Product Algorithm},
	url = {http://citeseer.ist.psu.edu/kschischang01factor.html},
	volume = {47},
	year = {2001}
}


@article{citeulike:474507,
	abstract = {Algorithms that must deal with complicated global functions of many variables often exploit the manner in which the given functions factor as a product of ``local'' functions, each of which depends on a subset of the variables. Such a factorization can be visualized with a bipartite graph that we call a factor graph. In this tutorial paper, we present a generic message-passing algorithm, the sum-product algorithm, that operates in a factor graph. Following a single, simple computational rule, the sum-product algorithm computes---either exactly or approximately---various marginal functions derived from the global function. A wide variety of algorithms developed in artificial intelligence, signal processing, and digital communications can be derived as specific instances of the sum-product algorithm, including the forward/backward algorithm, the Viterbi algorithm, the iterative ``turbo'' decoding algorithm, Pearl's (1988) belief propagation algorithm for Bayesian networks, the Kalman filter, and certain fast Fourier transform (FFT) algorithms},
	author = {Kschischang, F. R. and Frey, B. J. and Loeliger, H. A.},
	citeulike-article-id = {474507},
	journal = {IEEE Transactions on Information Theory},
	keywords = {message-passing},
	number = {2},
	pages = {498--519},
	priority = {2},
	title = {Factor graphs and the sum-product algorithm},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=910572},
	volume = {47},
	year = {2001}
}


% The former url value was a bare "#" placeholder, not a link, and was dropped.
@article{citeulike:80546,
	abstract = {We study the problem of learning multiple tasks in parallel within the online learning framework. On each online round, the algorithm receives an instance for each of the parallel tasks and responds by predicting the label of each instance. We consider the case where the predictions made on each round all contribute toward a common goal. The relationship between the various tasks is defined by a global loss function, which evaluates the overall quality of the multiple predictions made on each round. Specifically, each individual prediction is associated with its own loss value, and then these multiple loss values are combined into a single number using the global loss function. We focus on the case where the global loss function belongs to the family of absolute norms, and present several online learning algorithms for the induced problem. We prove worst-case relative loss bounds for all of our algorithms, and demonstrate the effectiveness of our approach on a large-scale multiclass-multilabel text categorization problem.},
	author = {Dekel, Ofer and Long, Philip M. and Singer, Yoram},
	citeulike-article-id = {80546},
	journal = {Journal of Machine Learning Research},
	keywords = {test},
	month = oct,
	pages = {2233--2264},
	priority = {2},
	title = {Online Learning of Multiple Tasks with a Shared Loss},
	volume = {8},
	year = {2007}
}


@book{citeulike:366370,
	abstract = {{This book is primarily aimed at graduate students and researchers in graph theory, combinatorics, or discrete mathematics in general. However, all the necessary graph theory is developed from scratch, so the only pre-requisite for reading it is a first course in linear algebra and a small amount of elementary group theory.  It should be accessible to motivated upper-level undergraduates.}},
	author = {Godsil, Chris and Royle, Gordon},
	citeulike-article-id = {366370},
	comment = {- Chapter 8 (Matrix theory) Example of cospectral non-isomorphic graphs. Spectrum determines edges and rectangles.
- Rank r implies decomposition into r-2s symmetric rank 1 matrices, s symmetric rank 2 matrices
- GF(2) rank-1 decompositions, GF(2) rank carries more information than regular rank},
	howpublished = {Paperback},
	isbn = {0387952209},
	keywords = {book, graph-theory},
	month = apr,
	priority = {2},
	publisher = {Springer},
	title = {Algebraic Graph Theory},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0387952209},
	year = {2001}
}


@book{citeulike:2111390,
	abstract = {{This text explores the solution of two-dimensional lattice models. Topics include basic statistical mechanics, Ising models, the mean field model, the spherical model, ice-type models, corner transfer matrices, hard hexagonal models, and elliptic functions. The author has updated the 1989 version with a new chapter, ``Subsequent Developments,'' for the 2007 edition.}},
	author = {Baxter, Rodney J.},
	citeulike-article-id = {2111390},
	comment = {- Basic concepts (phase transition, susceptibility)
- Full derivation of 1d ising},
	howpublished = {Paperback},
	isbn = {0486462714},
	keywords = {book, ising, physics},
	month = dec,
	priority = {2},
	publisher = {{Dover Publications}},
	title = {Exactly Solved Models in Statistical Mechanics},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0486462714},
	year = {2007}
}


% The abstract field holds the table of contents (chapter title and start page).
@book{citeulike:150268,
	abstract = {Fundamentals 1

Electrical Networks 39

Flows, Connectivity and matching 67 

Extremal Problems 103

Coloring

Ramsey Theory 181

Random Graphs 215

Graphs, Groups and Matrices 253

Random walks on Graphs

The Tutte Polynomial 335},
	author = {Bollob{\'a}s, B{\'e}la},
	citeulike-article-id = {150268},
	comment = {- recommended by Chris Hillman, esp. section on Tutte polynomial
- Random walk/electrical network connection
- Worked example of getting resistance from Kirchhoff laws directly
- Proof of Potts partition function = dichromatic polynomial},
	howpublished = {Paperback},
	isbn = {0387984887},
	keywords = {book, graph-theory, resistance},
	month = jul,
	priority = {2},
	publisher = {Springer},
	title = {Modern Graph Theory},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0387984887},
	year = {1998}
}


@misc{citeulike:2081200,
	abstract = {A popular account of the connection between random walks and electric
networks.},
	archiveprefix = {arXiv},
	author = {Doyle, Peter G. and Snell, J. Laurie},
	citeulike-article-id = {2081200},
	comment = {- Fundamental matrix of the absorbing chain
- Probabilistic interpretation of current and voltage},
	eprint = {math/0001057},
	keywords = {resistance},
	month = jan,
	priority = {2},
	title = {Random Walks and Electric Networks},
	url = {http://arxiv.org/abs/math/0001057},
	year = {2000}
}


@book{citeulike:2101795,
	abstract = {{A macroscopic system consists of a tremendous number of microscopic atoms and molecules. In thermal equilibrium the state of such a system is uniquely defined, despite the fact that the microscopic particles behave quite randomly. This observation gives rise to the fundamental law of the statistical physics; it allows entropy to be defined and a framework for the theory to be constructed but cannot be derived from quantum mechanics or force laws. Introduction to Statistical Physics seeks to explain the laws of the macroscopic level to undergraduate students learning them for the first time. The first part of this book explains the essence of statistical physics without going into details such as Liouville's theorem or ergodic theorem, which are difficult for beginners and unnecessary for actual application of the statistical mechanics. In the second part, statistical mechanics is applied to various systems which look different but have the same mathematical structure, in particular, features applications to quantum dynamics, thermodynamics, Ising model and statistical dynamics of free spins. Advanced topics in phase transitions and dense gases conclude the text, plus helpful appendices.}},
	author = {Yoshioka, Daijiro},
	citeulike-article-id = {2101795},
	comment = {- Section 9.3, description of Onsager's solution},
	howpublished = {Hardcover},
	isbn = {3540286055},
	keywords = {book, ising, physics},
	month = nov,
	priority = {2},
	publisher = {Springer},
	title = {Statistical Physics: An Introduction},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/3540286055},
	year = {2006}
}


@article{citeulike:2101252,
	abstract = {The resistance distance $r_{ij}$ between two vertices $v_i$ and $v_j$ of a (connected, molecular) graph $G$ is equal to the resistance between the respective two points of an electrical network, constructed so as to correspond to $G$, such that the resistance of any two adjacent points is unity. We show how the matrix elements $r_{ij}$ can be expressed in terms of the Laplacian eigenvalues and eigenvectors of $G$. In addition, we determine certain properties of the resistance matrix $R = \|r_{ij}\|$.},
	author = {Xiao, Wenjun and Gutman, Ivan},
	citeulike-article-id = {2101252},
	comment = {- formula 2 gives resistance in terms of pseudo-inverse of Laplacian of the graph (ideas/resistance/scratch.nb)},
	doi = {10.1007/s00214-003-0460-4},
	journal = {Theoretical Chemistry Accounts: Theory, Computation, and Modeling (Theoretica Chimica Acta)},
	keywords = {graph-theory, resistance},
	month = nov,
	number = {4},
	pages = {284--289},
	priority = {2},
	title = {Resistance distance and {Laplacian} spectrum},
	url = {http://dx.doi.org/10.1007/s00214-003-0460-4},
	volume = {110},
	year = {2003}
}


@book{citeulike:2100070,
	abstract = {{This book examines in detail the correlations for the two-dimensional Ising model in the infinite volume or thermodynamic limit and the sub- and super- critical continuum scaling limits. Steady progress in recent years has been made in understanding the special mathematical features of certain exactly solvable models in statistical mechanics and quantum field theory, including the scaling limits of the 2-D Ising (lattice) model, and more generally, a class of 2-D quantum fields known as holonomic fields. New results have made it possible to obtain a detailed nonperturbative analysis of the multi-spin correlations. In particular, the book focuses on deformation analysis of the scaling functions of the Ising model. This self-contained work also includes discussions on Pfaffians, elliptic uniformization, the Grassmann calculus for spin representations, Wiener--Hopf factorization, determinant bundles, and monodromy preserving deformations. This work explores the Ising model as a microcosm of the confluence of interesting ideas in mathematics and physics, and will appeal to graduate students, mathematicians, and physicists interested in the mathematics of statistical mechanics and quantum field theory.}},
	author = {Palmer, John},
	citeulike-article-id = {2100070},
	howpublished = {Hardcover},
	isbn = {081764248X},
	keywords = {book, ising},
	month = jul,
	priority = {2},
	publisher = {{Birkh{\"a}user Boston}},
	series = {Progress in Mathematical Physics},
	title = {Planar Ising Correlations},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/081764248X},
	year = {2007}
}


% Edited volume: the abstract describes chapters by "international
% contributors", so Hogben is recorded as editor rather than author.
@book{citeulike:2083121,
	abstract = {{The Handbook of Linear Algebra provides comprehensive coverage of linear algebra concepts, applications, and computational software packages in an easy-to-use handbook format. The esteemed international contributors guide you from the very elementary aspects of the subject to the frontiers of current research.  The book features an accessible layout of parts, chapters, and sections, with each section containing definition, fact, and example segments. The five main parts of the book encompass the fundamentals of linear algebra, combinatorial and numerical linear algebra, applications of linear algebra to various mathematical and nonmathematical disciplines, and software packages for linear algebra computations. Within each section, the facts (or theorems) are presented in a list format and include references for each fact to encourage further reading, while the examples illustrate both the definitions and the facts.  Linearization often enables difficult problems to be estimated by more manageable linear ones, making the Handbook of Linear Algebra essential reading for professionals who deal with an assortment of mathematical problems.}},
	citeulike-article-id = {2083121},
	comment = {- 1402 pages. Mathematica section. Linear algebra and graph theory.
- Sec 17, unitarily invariant norms (Ky-Fan, spectral)},
	editor = {Hogben, Leslie},
	howpublished = {Hardcover},
	isbn = {1584885106},
	keywords = {book, linear-algebra},
	month = nov,
	priority = {2},
	publisher = {{Chapman \& Hall/CRC}},
	series = {Discrete Mathematics and Its Applications},
	title = {Handbook of Linear Algebra},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/1584885106},
	year = {2006}
}


% The editor list is taken from the three compilers named in the abstract,
% whose names had been mis-encoded there.
@book{citeulike:623928,
	abstract = {{Combinatorics research, the branch of mathematics that deals with the study of discrete, usually finite, structures, covers a wide range of problems not only in mathematics but also in the biological sciences, engineering, and computer science. \emph{The Handbook of Combinatorics} brings together almost every aspect of this enormous field and is destined to become a classic. Ronald L. Graham, Martin Gr{\"o}tschel, and L{\'a}szl{\'o} Lov{\'a}sz, three of the world's leading combinatorialists, have compiled a selection of articles that cover combinatorics in graph theory, theoretical computer science, optimization, and convexity theory, plus applications in operations research, electrical engineering, statistical mechanics, chemistry, molecular biology, pure mathematics, and computer science. The 20 articles in Volume 1 deal with structures while the 24 articles in Volume 2 focus on aspects, tools, applications, and horizons.}},
	citeulike-article-id = {623928},
	comment = {Has Welsh's "Combinatorics in Statistical Physics" },
	editor = {Graham, Ronald L. and Gr{\"o}tschel, Martin and Lov{\'a}sz, L{\'a}szl{\'o}},
	howpublished = {Hardcover},
	isbn = {026207169X},
	keywords = {book, combinatorics},
	month = jan,
	priority = {2},
	publisher = {{The MIT Press}},
	title = {Handbook of Combinatorics: 2-volume set},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/026207169X},
	year = {1996}
}


% Chapter of the Handbook of Combinatorics (see the comment field), hence
% typed as incollection with a booktitle rather than as a journal article.
% NOTE(review): volume 2 is inferred from the page range - confirm.
@incollection{citeulike:2070809,
	address = {Cambridge, MA, USA},
	author = {Godsil, C. D. and Gr{\"o}tschel, M. and Welsh, D. J. A.},
	booktitle = {Handbook of Combinatorics},
	citeulike-article-id = {2070809},
	comment = {Survey paper from the "Handbook of Combinatorics"

- 2d Ising partition function by counting perfect matchings in a weighted edge graph
- 2d Ising partition function by counting Eulerian subgraphs
- transfer matrix intro},
	isbn = {0262071711},
	keywords = {combinatorics, physics},
	pages = {1925--1954},
	priority = {2},
	publisher = {MIT Press},
	title = {Combinatorics in statistical physics},
	url = {http://portal.acm.org/citation.cfm?id=233228.233253},
	volume = {2},
	year = {1995}
}


@book{citeulike:1633838,
	author = {Finch, Steven R.},
	citeulike-article-id = {1633838},
	comment = {Chapter 5:
- SAW Connective constants
- partition functions for hard-core models/independent sets
- low-high temperature expansions for Ising model, magnetic susceptibility
- equivalence of square-ice and 3-coloring on grid counting
},
	howpublished = {Hardcover},
	isbn = {0521818052},
	keywords = {book, saw},
	month = aug,
	priority = {2},
	publisher = {{Cambridge University Press}},
	series = {Encyclopedia of Mathematics and its Applications},
	title = {Mathematical Constants},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0521818052},
	year = {2003}
}


@misc{citeulike:2052018,
	abstract = {We give an alternative proof of the existence of the scaling limit of loop
erased random walk which does not use L{\"o}wner's differential equation.},
	archiveprefix = {arXiv},
	author = {Kozma, Gady},
	citeulike-article-id = {2052018},
	comment = {gives links to papers giving connections to other areas, like Potts model},
	eprint = {math.PR/0212338},
	keywords = {saw},
	month = dec,
	priority = {2},
	title = {Scaling limit of loop erased random walk - a naive approach},
	url = {http://arxiv.org/abs/math.PR/0212338},
	year = {2002}
}


@misc{citeulike:2046362,
	abstract = {We derive and prove exponential and form factor expansions of the row
correlation function and the diagonal correlation function of the two
dimensional Ising model.},
	archiveprefix = {arXiv},
	author = {Lyberg, I. and McCoy, B. M.},
	citeulike-article-id = {2046362},
	comment = {Summary of formulas for 2d Ising correlation functions},
	eprint = {math-ph/0612051},
	keywords = {ising},
	month = dec,
	priority = {2},
	title = {Form factor expansion of the row and diagonal correlation functions of the two dimensional {Ising} model},
	url = {http://arxiv.org/abs/math-ph/0612051},
	year = {2006}
}


@misc{citeulike:2046292,
	abstract = {Form factor representation of the correlation function of the 2D Ising model
on a cylinder is generalized to the case of arbitrary disposition of
correlating spins. The magnetic susceptibility on a lattice, one of whose
dimensions ($N$) is finite, is calculated in both para- and ferromagnetic
regions of parameters of the model. The singularity structure of the
susceptibility in the complex temperature plane at finite values of $N$ and the
thermodynamic limit $N\to\infty$ are discussed.},
	archiveprefix = {arXiv},
	author = {Bugrij, A. I. and Lisovyy, O.},
	citeulike-article-id = {2046292},
	comment = {References for derivation of correlation function for finite/infinite lattices},
	eprint = {hep-th/0106270},
	keywords = {ising},
	month = aug,
	priority = {2},
	title = {Magnetic susceptibility of the {2D} {Ising} model on a finite lattice},
	url = {http://arxiv.org/abs/hep-th/0106270},
	year = {2007}
}


@misc{citeulike:2043011,
	abstract = {Using exact expressions for the Ising form factors, we give a new very simple
proof that the spin-spin and disorder-disorder correlation functions are
governed by the Painlev{\'e} III non linear differential equation. We also show
that the generating function of the correlation functions of the descendents of
the spin and disorder operators is a $N$-soliton, $N\to\infty$, $\tau$-function
of the sinh-Gordon hierarchy. We discuss a relation of our approach to
isomonodromy deformation problems, as well as further possible generalizations.},
	archiveprefix = {arXiv},
	author = {Babelon, Olivier and Bernard, Denis},
	citeulike-article-id = {2043011},
	comment = {finding two-point correlation functions is an open problem},
	eprint = {hep-th/9206003},
	keywords = {ising},
	month = jun,
	priority = {2},
	title = {From Form Factors to Correlation Functions: The {Ising} Model},
	url = {http://arxiv.org/abs/hep-th/9206003},
	year = {1992}
}


@book{citeulike:1400630,
	abstract = {{In this rigorous account the author studies both discrete-time and continuous-time chains. A distinguishing feature is an introduction to more advanced topics such as martingales and potentials, in the established context of Markov chains.  There are applications to simulation, economics, optimal control, genetics, queues and many other topics, and a careful selection of exercises and examples drawn both from theory and practice. This is an ideal text for seminars on random processes or for those that are more oriented towards applications, for advanced undergraduates or graduate students with some background in basic probability theory.}},
	author = {Norris, James R.},
	citeulike-article-id = {1400630},
	comment = {- chapter 1 works out recurrence on 1d,2d,3d grids},
	howpublished = {Paperback},
	isbn = {0521633966},
	keywords = {book, saw},
	month = jul,
	priority = {2},
	publisher = {{Cambridge University Press}},
	series = {Cambridge Series in Statistical and Probabilistic Mathematics},
	title = {{Markov} Chains},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0521633966},
	year = {1998}
}


@article{citeulike:667579,
	abstract = {Formulas are obtained for the mean first passage times (as well as their dispersion) in random walks from the origin to an arbitrary lattice point on a periodic space lattice with periodic boundary conditions. Generally this time is proportional to the number of lattice points. The number of distinct points visited after n steps on a k-dimensional lattice (with $k \geq 3$) when n is large is $a_1 n + a_2 n^{1/2} + a_3 + a_4 n^{-1/2} + \cdots$. The constants $a_1$--$a_4$ have been obtained for walks on a simple cubic lattice when k = 3 and $a_1$ and $a_2$ are given for simple and face-centered cubic lattices. Formulas have also been obtained for the number of points visited r times in n steps as well as the average number of times a given point has been visited. The probability F(c) that a walker on a one-dimensional lattice returns to his starting point before being trapped on a lattice of trap concentration c is $F(c) = 1 + [c/(1 - c)] \log c$. Most of the results in this paper have been derived by the method of Green's functions.

\copyright\ 1965 The American Institute of Physics},
	author = {Montroll, Elliott W. and Weiss, George H.},
	citeulike-article-id = {667579},
	comment = {Applies Green function theory to find first-passage times for random walks on various lattices},
	doi = {10.1063/1.1704269},
	journal = {Journal of Mathematical Physics},
	keywords = {saw},
	number = {2},
	pages = {167--181},
	priority = {2},
	publisher = {AIP},
	title = {Random Walks on Lattices. {II}},
	url = {http://scitation.aip.org/getabs/servlet/GetabsServlet?prog=normal\&id=JMAPAQ000006000002000167000001\&idtype=cvips\&gifs=yes},
	volume = {6},
	year = {1965}
}


@book{citeulike:2018551,
	abstract = {{The aim of these notes is to link algorithmic problems arising in knot theory with statistical physics and classical combinatorics. Apart from the theory of computational complexity needed to deal with enumeration problems, introductions are given to several of the topics, such as combinatorial knot theory, randomized approximation models, percolation, and random cluster models.}},
	author = {Welsh, Dominic},
	citeulike-article-id = {2018551},
	comment = {- Definitions of NP, \#P, \#P\_1
- SAW is \#P, but probably not \#P-complete
- Results on Ising model (partition function in terms of Euler cycles)
- Four interpretations of Tutte polynomial (also see other Welsh paper)},
	howpublished = {Paperback},
	isbn = {0521457408},
	keywords = {book, computational-complexity},
	month = aug,
	priority = {2},
	publisher = {{Cambridge University Press}},
	series = {London Mathematical Society Lecture Note Series},
	title = {Complexity: Knots, Colourings and Counting},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0521457408},
	year = {1993}
}


@misc{citeulike:2010109,
	abstract = {We address the question of convergence in the
loopy belief propagation (LBP) algorithm.},
	author = {Tatikonda, S. and Jordan, M.},
	citeulike-article-id = {2010109},
	comment = {Gives (cryptic) definition of Gibbs distribution over countable set of variables},
	keywords = {graphical, message-passing},
	priority = {2},
	title = {Loopy belief propagation and {Gibbs} measures},
	url = {http://citeseer.ist.psu.edu/595409.html},
	year = {2002}
}


@misc{citeulike:141971,
	abstract = {The multivariate Tutte polynomial (known to physicists as the Potts-model
partition function) can be defined on an arbitrary finite graph G, or more
generally on an arbitrary matroid M, and encodes much important combinatorial
information about the graph (indeed, in the matroid case it encodes the full
structure of the matroid). It contains as a special case the familiar
two-variable Tutte polynomial -- and therefore also its one-variable
specializations such as the chromatic polynomial, the flow polynomial and the
reliability polynomial -- but is considerably more flexible. I begin by giving
an introduction to all these problems, stressing the advantages of working with
the multivariate version. I then discuss some questions concerning the complex
zeros of the multivariate Tutte polynomial, along with their physical
interpretations in statistical mechanics (in connection with the Yang--Lee
approach to phase transitions) and electrical circuit theory. Along the way I
mention numerous open problems. This survey is intended to be understandable to
mathematicians with no prior knowledge of physics.},
	archiveprefix = {arXiv},
	author = {Sokal, Alan D.},
	citeulike-article-id = {141971},
	comment = {- Kirchhoff's matrix-tree theorem explanation
- Any model can be mapped to hard-core local gas "polymer expansion" or "cluster expansion" (section V.7 of Simon)},
	eprint = {math.CO/0503607},
	keywords = {ising, resistance},
	month = mar,
	priority = {3},
	title = {The multivariate {Tutte} polynomial (alias {Potts} model) for graphs and matroids},
	url = {http://arxiv.org/abs/math.CO/0503607},
	year = {2005}
}


@article{citeulike:928328,
	abstract = {This is a tutorial review on the Potts model aimed at bringing out in an organized fashion the essential and important properties of the standard Potts model. Emphasis is placed on exact and rigorous results; but other aspects of the problem are also described to achieve a unified perspective. Topics reviewed include the mean-field theory; duality relations; series expansions; critical properties; experimental realizations; and the relationship of the Potts model with other lattice-statistical problems.},
	author = {Wu, F. Y.},
	citeulike-article-id = {928328},
	comment = {connection to resistor networks

Kirchhoff's result (evaluating resistance through spanning trees)

resistance between two nodes in terms of ratio of partition function of Potts model

a way to evaluate the number of spanning trees (complete self-avoiding walks) for any lattice},
	doi = {10.1103/RevModPhys.54.235},
	journal = {Reviews of Modern Physics},
	keywords = {ising},
	month = jan,
	number = {1},
	pages = {235--268},
	priority = {2},
	publisher = {American Physical Society},
	title = {The {Potts} model},
	url = {http://dx.doi.org/10.1103/RevModPhys.54.235},
	volume = {54},
	year = {1982}
}


@misc{citeulike:2007830,
	abstract = {This article provides an introduction to Schramm(stochastic)-Loewner
evolution (SLE) and to its connection with conformal field theory, from the
point of view of its application to two-dimensional critical behaviour. The
emphasis is on the conceptual ideas rather than rigorous proofs.},
	archiveprefix = {arXiv},
	author = {Cardy, John},
	citeulike-article-id = {2007830},
	comment = {Alternative formulation of partition function in terms of loops (eq. 2), hexagonal Ising model exploration (self-avoiding)},
	eprint = {cond-mat/0503313},
	keywords = {ising},
	month = may,
	priority = {2},
	title = {{SLE} for theoretical physicists},
	url = {http://arxiv.org/abs/cond-mat/0503313},
	year = {2005}
}


@misc{citeulike:2008089,
	abstract = {If a student asks for an antiderivative of $\exp(x^2)$, there is a standard
reply: the answer is not an elementary function. But if a student asks for a
closed-form expression for the real root of x = cos(x), there is no standard
reply. We propose a definition of a closed-form expression for a number (as
opposed to a *function*) that we hope will become standard. With our
definition, the question of whether the root of x = cos(x) has a closed form
is, perhaps surprisingly, still open. We show that Schanuel's conjecture in
transcendental number theory resolves questions like this, and we also sketch
some connections with Tarski's problem of the decidability of the first-order
theory of the reals with exponentiation. Many (hopefully accessible) open
problems are described.},
	archiveprefix = {arXiv},
	author = {Chow, Timothy Y.},
	citeulike-article-id = {2008089},
	eprint = {math.NT/9805045},
	keywords = {math},
	month = may,
	priority = {2},
	title = {What is a closed-form number?},
	url = {http://arxiv.org/abs/math.NT/9805045},
	year = {1998}
}


@misc{citeulike:1741264,
	abstract = {This article is a mini-review about electrical current flows in networks from
the perspective of statistical physics. We briefly discuss analytical methods
to solve the conductance of an arbitrary resistor network. We then turn to
basic results related to percolation: namely, the conduction properties of a
large random resistor network as the fraction of resistors is varied. We focus
on how the conductance of such a network vanishes as the percolation threshold
is approached from above. We also discuss the more microscopic current
distribution within each resistor of a large network. At the percolation
threshold, this distribution is multifractal in that all moments of this
distribution have independent scaling properties. We will discuss the meaning
of multifractal scaling and its implications for current flows in networks,
especially the largest current in the network. Finally, we discuss the relation
between resistor networks and random walks and show how the classic phenomena
of recurrence and transience of random walks are simply related to the
conductance of a corresponding electrical network.},
	archiveprefix = {arXiv},
	author = {Redner, S.},
	citeulike-article-id = {1741264},
	comment = {- spin-spin correlation function in Potts model in terms of SAW's
- classical solution of resistance problem (symmetry argument, Fourier, and Kirchhoff's spanning tree polynomial)
- partition function of Ising model becomes proportional spanning tree polynomial for certain interaction strength in the q->0 limit
- delta-Y,Y-delta transforms to simplify networks},
	eprint = {0710.1105},
	keywords = {ising, resistance, saw},
	month = oct,
	priority = {2},
	title = {Fractal and Multifractal Scaling of Electrical Conduction in Random Resistor Networks},
	url = {http://arxiv.org/abs/0710.1105},
	year = {2007}
}


@article{citeulike:2007242,
	author = {Guttmann, A. J.},
	citeulike-article-id = {2007242},
	comment = {Relates to SAW critical exponents to ones for Ising model on a lattice},
	journal = {Journal of Physics A: Mathematical and General},
	keywords = {saw},
	month = feb,
	pages = {455--468},
	priority = {2},
	title = {On two-dimensional self-avoiding random walks},
	url = {http://adsabs.harvard.edu/cgi-bin/nph-bib\_query?bibcode=1984JPhA...17..455G},
	volume = {17},
	year = {1984}
}


@article{citeulike:2003010,
	abstract = {We give an algorithm for the enumeration of self-avoiding walks on the (anisotropic) square lattice. Application of the algorithm on a 1024 processor Intel Paragon supercomputer resulted in a 51 term series. For (isotropic) square lattice self-avoiding polygons, a related algorithm has produced a 90 term series. Careful analysis provides compelling evidence for simple rational values of the exponents in both the dominant and subdominant terms in the asymptotic form of the coefficients. We also advance compelling arguments -- but not a proof -- that the generating function for SAW is not differentiably finite. The corresponding result for SAP has recently been proved.},
	author = {Guttmann, A. J. and Conway, A. R.},
	citeulike-article-id = {2003010},
	comment = {Survey of recent results on square lattice SAW},
	journal = {Annals of Combinatorics},
	keywords = {saw},
	month = dec,
	number = {3},
	pages = {319--345},
	priority = {2},
	title = {Square Lattice Self-Avoiding Walks and Polygons},
	url = {http://www.springerlink.com/content/yqkrqrrccba6rmv4},
	volume = {5},
	year = {2001}
}


@book{citeulike:761351,
	abstract = {{The self-avoiding walk is a mathematical model that has important applications in statistical mechanics and polymer science. In spite of its simple definition - a lattice path that does not visit the same site more than once - it is difficult to analyze mathematically. This largely self-contained monograph provides the first unified account of the known rigorous results for the self-avoiding walk, with particular emphasis on its critical behavior. Its goals are to give an account of the current mathematical understanding of the model, to indicate some of the applications of the concept in physics and in chemistry, and to give an introduction to some of the nonrigorous methods used in those fields. Topics covered in the book include: the lace expansion and its application to the self-avoiding walk in more than four dimensions, where most issues are now resolved; an introduction to the nonrigorous scaling theory; classical work of Hammersley and others; a new exposition of Kesten's pattern theorem and its consequences; a discussion of the decay of the two-point function and its relation to probabilistic renewal theory; analysis of Monte Carlo methods that have been used to study the self-avoiding walk; the role of the self-avoiding walk in physical and chemical applications. Methods from combinatorics, probability theory, analysis, and mathematical physics play important roles. The book is highly accessible to both professional and graduate students in mathematics, physics and chemistry.}},
	author = {Madras, Neal and Slade, Gordon},
	citeulike-article-id = {761351},
	comment = {- 2.3, N-vector model has the same correlation function as self-avoiding walks as N->0
- Appendix A gives potential function results on regular random walk},
	howpublished = {Paperback},
	isbn = {0817638911},
	keywords = {saw},
	month = aug,
	priority = {2},
	publisher = {Birkh{\"a}user},
	series = {Probability and its Applications},
	title = {The Self-Avoiding Walk},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0817638911},
	year = {1996}
}


@inproceedings{citeulike:1984136,
	address = {Philadelphia, PA, USA},
	author = {Martinelli, Fabio and Sinclair, Alistair and Weitz, Dror},
	booktitle = {SODA '04: Proceedings of the fifteenth annual ACM-SIAM symposium on Discrete algorithms},
	citeulike-article-id = {1984136},
	comment = {details on phase transitions in spin systems},
	isbn = {089871558X},
	keywords = {mixing, saw},
	pages = {456--465},
	priority = {2},
	publisher = {Society for Industrial and Applied Mathematics},
	title = {Fast mixing for independent sets, colorings and other models on trees},
	url = {http://portal.acm.org/citation.cfm?id=982792.982857},
	year = {2004}
}


@article{citeulike:1983618,
	address = {Cambridge, MA, USA},
	author = {Yuille, A. L.},
	citeulike-article-id = {1983618},
	comment = {Convergent version of BP},
	doi = {10.1162/08997660260028674},
	issn = {0899-7667},
	journal = {Neural Computation},
	keywords = {message-passing},
	month = jul,
	number = {7},
	pages = {1691--1722},
	priority = {2},
	publisher = {MIT Press},
	title = {{CCCP} algorithms to minimize the {Bethe} and {Kikuchi} free energies: convergent alternatives to belief propagation},
	url = {http://portal.acm.org/citation.cfm?id=638986},
	volume = {14},
	year = {2002}
}


@inproceedings{citeulike:1151015,
	abstract = {Belief propagation (BP) was only supposed to work for tree-like
networks but works surprisingly well in many applications involving
networks with loops, including turbo codes. However, there has
been little understanding of the algorithm or the nature of the
solutions it finds for general graphs.},
	author = {Yedidia, Jonathan S. and Freeman, William T. and Weiss, Yair},
	booktitle = {Advances in Neural Information Processing Systems ({NIPS})},
	citeulike-article-id = {1151015},
	comment = {explanation of free energies in relation to BP},
	keywords = {message-passing},
	pages = {689--695},
	priority = {2},
	title = {Generalized Belief Propagation},
	url = {http://citeseer.ist.psu.edu/yedidia00generalized.html},
	year = {2000}
}


@inproceedings{citeulike:1970924,
	address = {Philadelphia, PA, USA},
	author = {Gamarnik, David and Katz, Dmitriy},
	booktitle = {SODA '07: Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms},
	citeulike-article-id = {1970924},
	comment = {mentioned by Nair/Tetali paper},
	isbn = {9780898716245},
	keywords = {saw},
	pages = {1245--1254},
	priority = {2},
	publisher = {Society for Industrial and Applied Mathematics},
	title = {Correlation decay and deterministic {FPTAS} for counting list-colorings of a graph},
	url = {http://portal.acm.org/citation.cfm?id=1283517},
	year = {2007}
}


@misc{citeulike:1968741,
	abstract = {This paper deals with the construction of a correlation decay tree
(hypertree) for interacting systems modeled using graphs (hypergraphs) that can
be used to compute the marginal probability of any vertex of interest. Local
message passing equations have been used for some time to approximate the
marginal probabilities in graphs but it is known that these equations are
incorrect for graphs with loops. In this paper we construct, for any finite
graph and a fixed vertex, a finite tree with appropriately defined boundary
conditions so that the marginal probability on the tree at the vertex matches
that on the graph. For several interacting systems, we show using our approach
that if there is very strong spatial mixing on an infinite regular tree, then
one has strong spatial mixing for any given graph with maximum degree bounded
by that of the regular tree. Thus we identify the regular tree as the worst
case graph, in a weak sense, for the notion of strong spatial mixing.},
	archiveprefix = {arXiv},
	author = {Nair, Chandra and Tetali, Prasad},
	citeulike-article-id = {1968741},
	comment = {generalization of self-avoiding walk to multi-class problems},
	eprint = {math/0701494},
	keywords = {mixing, saw},
	month = feb,
	priority = {2},
	title = {The correlation decay ({CD}) tree and strong spatial mixing in multi-spin systems},
	url = {http://arxiv.org/abs/math/0701494},
	year = {2007}
}


@article{citeulike:151067,
	author = {Scott, Alexander and Sokal, Alan},
	citeulike-article-id = {151067},
	comment = {SAW tree characterization for hard model, cited by Weitz},
	doi = {10.1007/s10955-004-2055-4},
	issn = {0022-4715},
	journal = {Journal of Statistical Physics},
	keywords = {physics},
	month = mar,
	number = {5-6},
	pages = {1151--1261},
	priority = {2},
	publisher = {Kluwer Academic Publishers},
	title = {The Repulsive Lattice Gas, the Independent-Set Polynomial, and the {Lov{\'a}sz} Local Lemma},
	url = {http://dx.doi.org/10.1007/s10955-004-2055-4},
	volume = {118},
	year = {2005}
}


@article{citeulike:1968465,
	address = {Philadelphia, PA, USA},
	author = {Dyer, Martin and Frieze, Alan and Jerrum, Mark},
	citeulike-article-id = {1968465},
	comment = {Shows that MCMC counting must change O(n) vertices at each step for certain graphs (non-local alg), cited by Weitz},
	doi = {10.1137/S0097539701383844},
	issn = {0097-5397},
	journal = {SIAM Journal on Computing},
	keywords = {graph-theory},
	number = {5},
	pages = {1527--1541},
	priority = {2},
	publisher = {Society for Industrial and Applied Mathematics},
	title = {On Counting Independent Sets in Sparse Graphs},
	url = {http://portal.acm.org/citation.cfm?id=586849.587021},
	volume = {31},
	year = {2002}
}


@inproceedings{citeulike:1962672,
	author = {Weitz, Dror  },
	title = {Counting independent sets up to the tree threshold},
	booktitle = {STOC '06: Proceedings of the thirty-eighth annual ACM symposium on Theory of computing},
	pages = {140--149},
	year = {2006},
	publisher = {ACM},
	address = {New York, NY, USA},
	isbn = {1595931341},
	doi = {10.1145/1132516.1132538},
	url = {http://portal.acm.org/citation.cfm?id=1132516.1132538},
	keywords = {message-passing, saw},
	comment = {- SAW algorithm for hard-core model},
	priority = {2},
	citeulike-article-id = {1962672}
}


@misc{citeulike:1962528,
	abstract = {We present a new local approximation algorithm for computing Maximum a
Posteriori (MAP) and log-partition function for arbitrary exponential family
distribution represented by a finite-valued pair-wise Markov random field
(MRF), say $G$. Our algorithm is based on decomposition of $G$ into {\em
appropriately} chosen small components; then computing estimates locally in
each of these components and then producing a {\em good} global solution. We
show that if the underlying graph $G$ either excludes some finite-sized graph
as its minor (e.g. Planar graph) or has low doubling dimension (e.g. any graph
with {\em geometry}), then our algorithm will produce solution for both
questions within {\em arbitrary accuracy}. We present a message-passing
implementation of our algorithm for MAP computation using self-avoiding walk of
graph. In order to evaluate the computational cost of this implementation, we
derive novel tight bounds on the size of self-avoiding walk tree for arbitrary
graph.


As a consequence of our algorithmic result, we show that the normalized
log-partition function (also known as free-energy) for a class of {\em regular}
MRFs will converge to a limit, that is computable to an arbitrary accuracy.},
	archiveprefix = {arXiv},
	author = {Jung, Kyomin and Shah, Devavrat},
	citeulike-article-id = {1962528},
	comment = {Proof of SAW tree representation},
	eprint = {cs/0610111},
	keywords = {message-passing},
	month = oct,
	priority = {2},
	title = {Local approximate inference algorithms},
	url = {http://arxiv.org/abs/cs/0610111},
	year = {2007}
}


@inproceedings{ihler07b,
	author = {Ihler, A.},
	booktitle = {Proceedings of {UAI} 2007},
	citeulike-article-id = {1962515},
	keywords = {bibtex-import},
	month = jul,
	priority = {2},
	title = {Accuracy bounds for belief propagation},
	year = {2007}
}


@article{citeulike:1947450,
	abstract = {The purpose of this survey is to give an overview of results on spectra of infinite
graphs, emphasizing how contributions from different areas fit into this graph-theoretical
setting. For the moment, we point out the books by E. Seneta [113] and
by A. Figa-Talamanca and M. A. Picardello [45] as two examples. Among the variety
of books on the background in functional analysis and matrix theory, we emphasize
[1, 30, 41, 118, 129]. Our approach follows that of B. Mohar [89] and related
definitions of spectra, as they have been used in the mathematical fields mentioned
above. A different approach, due to A. Torgasev [119], will be only briefly discussed;
the reader is referred to the recent book [33].},
	author = {Mohar, Bojan and Woess, Wolfgang},
	citeulike-article-id = {1947450},
	journal = {Bulletin of the London Mathematical Society},
	keywords = {graph-theory},
	pages = {209--234},
	priority = {2},
	title = {A Survey on Spectra of Infinite Graphs},
	url = {http://cat.inist.fr/?aModele=afficheN\&cpsidt=7344410},
	volume = {21},
	year = {1989}
}


@techreport{citeulike:1941186,
	abstract = {Local belief propagation rules of the sort proposed by Pearl (1988) are guaranteed to converge to the optimal beliefs for singly connected networks. Recently, a number of researchers have empirically demonstrated good performance of these same algorithms on networks with loops, but a theoretical understanding of this performance has yet to be achieved. Here we lay a foundation for an understanding of belief propagation in networks with loops. For networks with a single loop, we derive an...},
	author = {Weiss, Yair},
	citeulike-article-id = {1941186},
	comment = {a way to correct belief propagation},
	institution = {Massachusetts Institute of Technology, Artificial Intelligence Laboratory},
	keywords = {message-passing},
	number = {AIM-1616},
	priority = {2},
	title = {Belief Propagation and Revision in Networks with Loops},
	url = {http://citeseer.ist.psu.edu/weiss97belief.html},
	year = {1997}
}


@book{citeulike:141092,
	abstract = {{Information theory and inference, often taught separately, are here united in one entertaining textbook. These topics lie at the heart of many exciting areas of contemporary science and engineering - communication, signal processing, data mining, machine learning, pattern recognition, computational neuroscience, bioinformatics, and cryptography.   This textbook introduces theory in tandem with applications. Information theory is taught alongside practical communication systems, such as arithmetic coding for data compression and sparse-graph codes for error-correction. A toolbox of inference techniques, including message-passing algorithms, Monte Carlo methods, and variational approximations, are developed alongside applications of these tools to clustering, convolutional codes, independent component analysis, and neural networks.   The final part of the book describes the state of the art in error-correcting codes, including low-density parity-check codes, turbo codes, and digital fountain codes -- the twenty-first century standards for satellite communications, disk drives, and data broadcast.    Richly illustrated, filled with worked examples and over 400 exercises, some with detailed solutions, David MacKay's groundbreaking book is ideal for self-learning and for undergraduate or graduate courses. Interludes on crosswords, evolution, and sex provide entertainment along the way.   In sum, this is a textbook on information, communication, and coding for a new generation of students, and an unparalleled entry point into these subjects for professionals in areas as diverse as computational biology, financial engineering, and machine learning.}},
	author = {MacKay, David J. C.},
	citeulike-article-id = {141092},
	comment = {intro to sum-product for error-correcting codes},
	howpublished = {Hardcover},
	isbn = {0521642981},
	keywords = {message-passing},
	month = jun,
	priority = {2},
	publisher = {{Cambridge University Press}},
	title = {Information Theory, Inference \& Learning Algorithms},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0521642981},
	year = {2002}
}


@misc{citeulike:1912568,
	author = {Ryan, William  },
	title = {Low-Density Parity-Check Codes},
	abstract = {Presentation on Error Correcting Codes},
	keywords = {ecoc, notes},
	comment = {detailed examples of decoding},
	priority = {2},
	citeulike-article-id = {1912568}
}


@article{citeulike:1901016,
	address = {Cambridge, MA, USA},
	author = {Heskes, Tom},
	citeulike-article-id = {1901016},
	comment = {Explanation of free energy},
	doi = {10.1162/0899766041941943},
	issn = {0899-7667},
	journal = {Neural Computation},
	keywords = {message-passing},
	month = nov,
	number = {11},
	pages = {2379--2413},
	priority = {2},
	publisher = {MIT Press},
	title = {On the uniqueness of loopy belief propagation fixed points},
	url = {http://portal.acm.org/citation.cfm?id=1039398.1039405},
	volume = {16},
	year = {2004}
}


@misc{citeulike:1900042,
	abstract = {I show how a hidden Markov model can be expressed as a Gibbs distribution. I review my Gibbs distribution training algorithm (a 'Gibbs Machine' rather than a 'Boltzmann Machine'), which I use to perform gradient ascent on the relative entropy between the Gibbs distribution and the data distribution. I demonstrate how this reduces to elementary matrix computations of exactly the same form as encountered in the Baum-Welch re-estimation method. Although this toy problem is amenable to Baum-Welch re-estimation, the same cannot be said of non-tree-like Markov models. In such cases I propose that a hybrid Baum-Welch/Gibbs Machine optimisation scheme should be used.},
	author = {Luttrell, Steve},
	citeulike-article-id = {1900042},
	comment = {Gradient based training method for HMM's, Hammersley-Clifford expansion},
	keywords = {hmm},
	priority = {2},
	title = {The {Gibbs} Machine applied to hidden {Markov} model problems},
	year = {1989}
}


@misc{citeulike:1896529,
	abstract = {1 Foundation of statistical mechanics. 1
1.1 Introduction. 1
1.2 Program of statistical mechanics. 4
1.3 States of a system. 5
1.4 Averages. 10
1.5 Thermal equilibrium. 14
1.6 Entropy and temperature. 16
1.7 Laws of thermodynamics. 19
1.8 Problems for chapter 1. 20
2 The canonical ensemble 23
2.1 Introduction. 23
2.2 Energy and entropy and temperature. 26
2.3 Work and pressure. 28
2.4 Helmholtz free energy. 31
2.5 Changes in variables. 32
2.6 Properties of the Helmholtz free energy. 33
2.7 Energy fluctuations. 35
2.8 A simple example. 37
2.9 Problems for chapter 2. 39
3 Variable number of particles 43
3.1 Chemical potential. 43
3.2 Examples of the use of the chemical potential. 46
3.3 Differential relations and grand potential. 48
3.4 Grand partition function. 50
3.5 Overview of calculation methods. 55
3.6 A simple example. 57
3.7 Ideal gas in first approximation. 58
3.8 Problems for chapter 3. 64
4 Statistics of independent particles. 67
4.1 Introduction. 67
4.2 Boltzmann gas again. 74
4.3 Gas of poly-atomic molecules. 77
4.4 Degenerate gas. 79
4.5 Fermi gas. 80
4.6 Boson gas. 85
4.7 Problems for chapter 4. 87
5 Fermions and Bosons 89
5.1 Fermions in a box. 89
5.2 Bosons in a box. 106
5.3 Bose-Einstein condensation. 113
5.4 Problems for chapter 5. 115
6 Density matrix formalism. 119
6.1 Density operators. 119
6.2 General ensembles. 123
6.3 Maximum entropy principle. 125
6.4 Equivalence of entropy definitions for canonical ensemble. 134
6.5 Problems for chapter 6. 136
A Solutions to selected problems. 139
A.1 Solutions for chapter 1. 139
A.2 Solutions for chapter 2. 147
A.3 Solutions for chapter 3. 152
A.4 Solutions for chapter 4. 157
A.5 Solutions for chapter 5. 161
A.6 Solutions for chapter 6. 167},
	author = {Jansen, Henri J. F.},
	citeulike-article-id = {1896529},
	comment = {Section 1.3 derives multiplicity function for Ising model, shows no net magnetization},
	keywords = {ising, notes},
	priority = {2},
	title = {Statistical Mechanics},
	year = {2007}
}


@book{citeulike:1891903,
	abstract = {{Intended for beginning graduate students or advanced undergraduates, this text covers the statistical basis of thermodynamics, including examples from solid-state physics. It also treats some topics of more recent interest such as phase transitions and non-equilibrium phenomena. The presentation introduces modern concepts, such as the thermodynamic limit and equivalence of Gibbs ensembles, and uses simple models (ideal gas, Einstein solid, simple paramagnet) and many examples to make the mathematical ideas clear. Frequently used mathematical methods are discussed in detail and reviewed in an appendix.   The book begins with a review of statistical methods and classical thermodynamics, making it suitable for students from a variety of backgrounds. Statistical mechanics is formulated in the microcanonical ensemble; some simple arguments and many examples are used to construct the canonical and grand-canonical ensembles. The discussion of quantum statistical mechanics includes Bose and Fermi ideal gases, the Bose-Einstein condensation, blackbody radiation, phonons and magnons. The van der Waals and Curie-Weiss phenomenological models are used to illustrate the classical theories of phase transitions and critical phenomena; modern developments are introduced with discussions of the Ising model, scaling theory, and renormalization-group ideas. The book concludes with two chapters on nonequilibrium phenomena: one using Boltzmann's kinetic approach, and the other based on stochastic methods.   Exercises at the end of each chapter are an integral part of the course, clarifying and extending topics discussed in the text. Hints and solutions can be found on the author's web site.}},
	author = {Salinas, Silvio},
	citeulike-article-id = {1891903},
	comment = {- Ch.13 gives introduction to Ising model
- Mean-field BP, Curie-Weiss approximations, Onsager's exact solution},
	howpublished = {Hardcover},
	isbn = {0387951199},
	keywords = {ising, notes},
	month = feb,
	priority = {2},
	publisher = {Springer},
	title = {Introduction to Statistical Physics},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0387951199},
	year = {2001}
}


@incollection{citeulike:1891894,
	author = {Rasaiah, Jayendran C.},
	booktitle = {Encyclopedia of Chemical Physics and Physical Chemistry},
	citeulike-article-id = {1891894},
	comment = {Derivation of 1D Ising model with and without magnetic field, mean field approximation},
	keywords = {ising},
	priority = {2},
	title = {Statistical mechanics of strongly interacting systems},
	year = {2001}
}


@misc{citeulike:1278871,
	abstract = {The combination of the compactness of networks and their complex
architectures results in a variety of critical effects dramatically different
from those in cooperative systems on lattices. In the last few years,
researchers have made important steps toward understanding the qualitatively
new critical phenomena in complex networks. We review the results, concepts,
and methods of this rapidly developing field. Here we mostly consider two
closely related classes of these critical phenomena, namely structural phase
transitions in the network architectures and transitions in cooperative models
on networks as substrates. We also discuss systems where a network and
interacting agents on it influence each other. We overview a wide range of
critical phenomena in equilibrium and growing networks including the birth of
the giant connected component, percolation, k-core percolation, phenomena near
epidemic thresholds, condensation transitions, critical phenomena in spin
models placed on networks, synchronisation, and self-organized criticality
effects in interacting systems on networks. We also discuss strong finite size
effects in these systems and highlight open problems and perspectives.},
	archiveprefix = {arXiv},
	author = {Dorogovtsev, S. N. and Goltsev, A. V. and Mendes, J. F. F.},
	citeulike-article-id = {1278871},
	comment = {- Chapter VI gives background on Ising models on trees.
- Second order transition for Bethe lattice.
- Belief propagation exact for fully connected graph?
},
	eprint = {0705.0010},
	keywords = {ising},
	month = may,
	priority = {2},
	title = {Critical phenomena in complex networks},
	url = {http://arxiv.org/abs/0705.0010},
	year = {2007}
}


@inproceedings{citeulike:1886596,
	abstract = {We address the question of convergence in the sum-product algorithm. Specifically, we relate convergence of the sum-product algorithm to the existence of a weak limit for a sequence of Gibbs measures defined on the associated computation tree. Using tools from the theory of Gibbs measures we develop easily testable sufficient conditions for convergence. The failure of convergence of the sum-product algorithm implies the existence of multiple phases for the associated Gibbs specification. These results give new insight into the mechanics of the algorithm.},
	author = {Tatikonda, S. C.},
	booktitle = {Proceedings of the 2003 {IEEE} Information Theory Workshop},
	citeulike-article-id = {1886596},
	comment = {uses theory of Gibbs measures?},
	keywords = {graphical, message-passing},
	pages = {222--225},
	priority = {2},
	title = {Convergence of the sum-product algorithm},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1216735},
	year = {2003}
}


@misc{citeulike:1886585,
	abstract = {We derive novel conditions that guarantee convergence of the Sum-Product
algorithm (also known as Loopy Belief Propagation or simply Belief Propagation)
to a unique fixed point, irrespective of the initial messages. The
computational complexity of the conditions is polynomial in the number of
variables. In contrast with previously existing conditions, our results are
directly applicable to arbitrary factor graphs (with discrete variables) and
are shown to be valid also in the case of factors containing zeros, under some
additional conditions. We compare our bounds with existing ones, numerically
and, if possible, analytically. For binary variables with pairwise
interactions, we derive sufficient conditions that take into account local
evidence (i.e., single variable factors) and the type of pair interactions
(attractive or repulsive). It is shown empirically that this bound outperforms
existing bounds.},
	archiveprefix = {arXiv},
	author = {Mooij, Joris M. and Kappen, Hilbert J.},
	citeulike-article-id = {1886585},
	comment = {- Incorporates strong local evidence
- Derives Birkhoff condition -Tanh[Log[x]/4]=(1-sqrt(phi))/(1+sqrt(phi))},
	eprint = {cs/0504030},
	keywords = {graphical, message-passing},
	month = may,
	priority = {2},
	title = {Sufficient conditions for convergence of the {Sum-Product} Algorithm},
	url = {http://arxiv.org/abs/cs/0504030},
	year = {2007}
}


@incollection{citeulike:1883689,
	author = {Rasaiah, Jayendran C.},
	booktitle = {Encyclopedia of Chemical Physics and Physical Chemistry},
	citeulike-article-id = {1883689},
	comment = {Exact solution of 1d Ising model},
	keywords = {ising, notes, physics},
	priority = {2},
	title = {Statistical Mechanics of Strongly Interacting Systems: liquids and solids},
	year = {2001}
}


@inproceedings{citeulike:1849294,
	author = {Achlioptas, Dimitris},
	title = {Database-friendly random projections},
	booktitle = {PODS '01: Proceedings of the twentieth ACM SIGMOD-SIGACT-SIGART symposium on Principles of database systems},
	pages = {274--281},
	publisher = {ACM},
	address = {New York, NY, USA},
	year = {2001},
	isbn = {1581133618},
	doi = {10.1145/375551.375608},
	url = {http://portal.acm.org/citation.cfm?id=375608},
	keywords = {linear-algebra, random-projections},
	comment = {SVD optimal under any rotationally invariant norm},
	priority = {2},
	citeulike-article-id = {1849294}
}


@inproceedings{citeulike:1849039,
	address = {New York, NY, USA},
	author = {Li, Ping},
	booktitle = {KDD '07: Proceedings of the 13th ACM SIGKDD international conference on Knowledge discovery and data mining},
	citeulike-article-id = {1849039},
	doi = {10.1145/1281192.1281241},
	isbn = {9781595936097},
	keywords = {linear-algebra},
	pages = {440--449},
	priority = {2},
	publisher = {ACM},
	title = {Very sparse stable random projections for dimension reduction in {$l_\alpha$} {$(0 < \alpha \leq 2)$} norm},
	url = {http://portal.acm.org/citation.cfm?id=1281241},
	year = {2007}
}


@inproceedings{citeulike:1823831,
	author = {Li, Ping and Hastie, Trevor J. and Church, Kenneth W.},
	title = {Very sparse random projections},
	booktitle = {KDD '06: Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining},
	pages = {287--296},
	publisher = {ACM Press},
	address = {New York, NY, USA},
	year = {2006},
	isbn = {1595933395},
	doi = {10.1145/1150402.1150436},
	url = {http://portal.acm.org/citation.cfm?id=1150436},
	keywords = {linear-algebra},
	comment = {random projections where matrices are {-1,0,1}, sparser than {0,1} random matrices},
	priority = {2},
	citeulike-article-id = {1823831}
}


@article{citeulike:1781845,
	abstract = {We consider the problem of giving explicit spectral bounds for time inhomogeneous Markov chains on a finite state space. We give bounds that apply when there exists a probability $\pi$ such that each of the different steps corresponds to a nice ergodic Markov kernel with stationary measure $\pi$. For instance, our results provide sharp bounds for models such as semi-random transpositions and semi-random insertions (in these cases $\pi$ is the uniform probability on the symmetric group).},
	author = {Saloff-Coste, L. and Zuniga, J.},
	citeulike-article-id = {1781845},
	comment = {Borel-Doeblin theorem (bounding the distance of incorrectly initialized inhomogeneous Markov chain from invariant distribution)

Bounds on eigenvalues},
	doi = {10.1016/j.spa.2006.11.004},
	journal = {Stochastic Processes and their Applications},
	keywords = {ergodicity, hmm},
	month = aug,
	number = {8},
	pages = {961--979},
	priority = {2},
	title = {Convergence of some time inhomogeneous {Markov} chains via spectral techniques},
	url = {http://dx.doi.org/10.1016/j.spa.2006.11.004},
	volume = {117},
	year = {2007}
}


@misc{citeulike:546887,
	abstract = {A popular account of the connection between random walks and electric
networks.},
	archiveprefix = {arXiv},
	author = {Doyle, Peter G. and Snell, J. Laurie},
	citeulike-article-id = {546887},
	comment = {- Harmonic function theory
- Dirichlet's problem
- trapping Markov chains
---=note-separator=---
Polya's theorem using electrical network.

Interpretation of current and voltage through random walks},
	eprint = {math.PR/0001057},
	keywords = {hmm, physics},
	month = jan,
	priority = {2},
	title = {Random Walks and Electric Networks},
	url = {http://arxiv.org/abs/math.PR/0001057},
	year = {2000}
}


@misc{citeulike:1757041,
	abstract = {Introduction

In a remarkable recent paper, Viswanath (1998) has considered the large $n$
behaviour of solutions to the `random Fibonacci recurrence'

$x_{n+1} = \pm x_n \pm x_{n-1}$, (1.1)
where the signs are chosen independently and with equal probabilities, and
$x_0 = x_1 = 1$. Computer experiments, as in figure 1, show exponential growth with
$n$. The problem of large $n$ behaviour of (1.1) has been mentioned at least since
1963, when Furstenberg (1963) established exponential growth with...},
	author = {Embree, Mark and Trefethen, Lloyd N.},
	citeulike-article-id = {1757041},
	keywords = {hmm, linear-algebra},
	priority = {2},
	title = {Growth and Decay of Random {Fibonacci} Sequences},
	url = {http://citeseer.ist.psu.edu/388624.html}
}


@article{citeulike:670015,
	author = {Moler, Cleve and Van Loan, Charles},
	title = {Nineteen Dubious Ways to Compute the Exponential of a Matrix},
	journal = {SIAM Review},
	volume = {20},
	number = {4},
	pages = {801--836},
	year = {1978},
	url = {http://links.jstor.org/sici?sici=0036-1445\%28197810\%2920\%3A4\%3C801\%3ANDWTCT\%3E2.0.CO\%3B2-W},
	abstract = {In principle, the exponential of a matrix could be computed in many ways. Methods involving approximation theory, differential equations, the matrix eigenvalues, and the matrix characteristic polynomial have been proposed. In practice, consideration of computational stability and efficiency indicates that some of the methods are preferable to others, but that none are completely satisfactory.},
	keywords = {linear-algebra},
	priority = {2},
	citeulike-article-id = {670015}
}


@article{citeulike:1731703,
	author = {Mease, David and Wyner, Abraham},
	citeulike-article-id = {1731703},
	journal = {Journal of Machine Learning Research},
	keywords = {boosting},
	priority = {2},
	title = {Evidence Contrary to the Statistical View of Boosting},
	year = {2007}
}


@book{citeulike:1728048,
	abstract = {{This book provides an integrated treatment of the theory of nonnegative matrices and some related classes of positive matrices, concentrating on connections with game theory, combinatorics, inequalities, optimization and mathematical economics. The authors have chosen the wide variety of applications, which include price fixing, scheduling, and the fair division problem, both for their elegant mathematical content and for their accessibility to students with minimal preparation. They present many new results in matrix theory for the first time in book form, while they present more standard topics in a novel fashion.  The treatment is rigorous and almost all results are proved completely. These new results and applications will be of great interest to researchers in linear programming, statistics, and operations research.  The minimal prerequisites also make the book accessible to first year graduate students.}},
	author = {Bapat, R. B. and Raghavan, T. E. S.},
	citeulike-article-id = {1728048},
	comment = {Full indecomposability, Frobenius normal form (final classes), doubly stochastic matrices as convex combinations of permutation matrices, convergence of weak transitive closure (I+A+A^2...)},
	howpublished = {Hardcover},
	isbn = {0521571677},
	keywords = {hmm, linear-algebra},
	month = mar,
	priority = {2},
	publisher = {{Cambridge University Press}},
	series = {Encyclopedia of Mathematics and its Applications},
	title = {Nonnegative Matrices and Applications},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0521571677},
	year = {1997}
}


@book{citeulike:1061890,
	abstract = {{This book gives a thorough, up-to-date treatment of the behavior of numerical algorithms in finite precision arithmetic. It combines algorithmic derivations, perturbation theory, and rounding error analysis, all enlivened by historical perspective and informative quotations. The coverage of the first edition has been expanded and updated, involving numerous improvements. Two new chapters treat symmetric indefinite systems and skew-symmetric systems, and nonlinear systems and Newton's method. Twelve new sections include coverage of additional error bounds for Gaussian elimination, rank revealing LU factorizations, weighted and constrained least squares problems, and the fused multiply-add operation found on some modern computer architectures. This new edition is a suitable reference for an advanced course and can also be used at all levels as a supplementary text from which to draw examples, historical perspective, statements of results, and exercises. In addition the thorough indexes and extensive, up-to-date bibliography are in a readily accessible form.}},
	author = {Higham, Nicholas J.},
	citeulike-article-id = {1061890},
	comment = {Pseudo-spectra, hump of non-normal matrices},
	edition = {Second},
	howpublished = {Hardcover},
	isbn = {0898715210},
	keywords = {linear-algebra, numerical-analysis},
	month = aug,
	priority = {2},
	publisher = {{SIAM: Society for Industrial and Applied Mathematics}},
	title = {Accuracy and Stability of Numerical Algorithms},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0898715210},
	year = {2002}
}


@article{citeulike:1725176,
	author = {Trefethen, Lloyd N. and Embree, Mark},
	citeulike-article-id = {1725176},
	journal = {Bulletin of the American Mathematical Society},
	keywords = {linear-algebra, numerical-analysis},
	number = {2},
	priority = {2},
	title = {Spectra and pseudospectra: The behavior of nonnormal matrices and operators, Review},
	url = {http://www.ams.org/bull/2007-44-02/S0273-0979-06-01128-1/home.html},
	volume = {44},
	year = {2007}
}


@article{citeulike:1718527,
	abstract = {If $A$ is an $n \times n$ nonnegative, irreducible matrix, then there exists $\mu(A) > 0$, and a positive vector $x$ such that $\max_j a_{ij} x_j = \mu(A) x_i$, $i = 1, 2, \ldots, n$. Furthermore, $\mu(A)$ is the maximum geometric mean of a circuit in the weighted directed graph corresponding to $A$. This theorem, which we refer to as the max version of the Perron-Frobenius Theorem, is well-known in the context of matrices over the max algebra and also in the context of matrix scalings. In the present work, which is partly expository, we bring out the intimate connection between this result and the Perron-Frobenius theory. We present several proofs of the result, some of which use the Perron-Frobenius Theorem. Structure of max eigenvalues and max eigenvectors is described. Possible ways to unify the Perron-Frobenius Theorem and its max version are indicated. Some inequalities for $\mu(A)$ are proved.},
	author = {Bapat, R. B.},
	booktitle = {Proceedings of the Sixth Conference of the International Linear Algebra Society},
	citeulike-article-id = {1718527},
	comment = {Several proofs of max Perron theorem

Max-plus and regular Perron eigenvalues as limits of each other

Way to compute max-plus eigen-basis

Characterization of eigenvalues from Frobenius normal form},
	doi = {10.1016/S0024-3795(97)10057-X},
	journal = {Linear Algebra and its Applications},
	keywords = {maxplus},
	month = may,
	pages = {3--18},
	priority = {2},
	title = {A max version of the {Perron-Frobenius} theorem},
	url = {http://dx.doi.org/10.1016/S0024-3795(97)10057-X},
	volume = {275-276},
	year = {1998}
}


@article{citeulike:1717419,
	abstract = {The eigenvalue problem for an irreducible nonnegative matrix $A = [a_{ij}]$ in the max algebra system is $A \otimes x = \lambda x$, where $(A \otimes x)_i = \max_j (a_{ij} x_j)$ and $\lambda$ turns out to be the maximum circuit geometric mean, $\mu(A)$. A power method algorithm is given to compute $\mu(A)$ and eigenvector $x$. The algorithm is developed by using results on the convergence of max powers of $A$, which are proved using nonnegative matrix theory. In contrast to an algorithm developed in [4], this new method works for any irreducible nonnegative $A$, and calculates eigenvectors in a simpler and more efficient way. Some asymptotic formulas relating $\mu(A)$, the spectral radius and norms are also given.},
	author = {Elsner, Ludwig and van den Driessche, P.},
	citeulike-article-id = {1717419},
	doi = {10.1016/S0024-3795(98)10171-4},
	journal = {Linear Algebra and its Applications},
	keywords = {maxplus},
	month = dec,
	pages = {17--32},
	priority = {2},
	title = {On the power method in max algebra},
	url = {http://dx.doi.org/10.1016/S0024-3795(98)10171-4},
	volume = {302-303},
	year = {1999}
}


@unpublished{citeulike:1717089,
	abstract = {This is a survey about the title question, written for people who (like the author) see logic as
forbidding, esoteric, and remote from their usual concerns. Beginning with a crash course on
Zermelo-Fraenkel set theory, it discusses oracle independence; natural proofs; independence results of Razborov,
Raz, DeMillo-Lipton, Sazanov, and others; and obstacles to proving P vs. NP independent of strong
logical theories. It ends with some philosophical musings on when one should expect a mathematical
question to have a definite answer.},
	author = {Aaronson, Scott},
	citeulike-article-id = {1717089},
	keywords = {computational-complexity},
	note = {Survey manuscript},
	priority = {2},
	title = {Is {P} Versus {NP} Formally Independent?}
}


@techreport{citeulike:1716177,
	author = {Bouillard, Anne and Gaujal, Bruno},
	citeulike-article-id = {1716177},
	comment = {Suggested by Mairesse, example of arbitrarily long coupling time, conventional PF theorem},
	institution = {INRIA},
	keywords = {maxplus},
	priority = {2},
	title = {Coupling time of a (max, plus) matrix},
	year = {2000}
}


@article{citeulike:1701998,
	abstract = {A discrete-event system is a system whose behavior can be described by means of a set of time-consuming activities, performed according to a prescribed ordering. Events correspond to starting or ending some activity. An analogy between linear systems and a class of discrete-event systems is developed. Following this analogy, such discrete-event systems can be viewed as linear, in the sense of an appropriate algebra. The periodical behavior of closed discrete-event systems, i.e., involving a set of repeatedly performed activities, can be totally characterized by solving an eigenvalue and eigenvector equation in this algebra. This problem is numerically solved by an efficient algorithm which basically consists of finding the shortest paths from one node to all other nodes in a graph. The potentiality of this approach for the performance evaluation of flexible manufacturing systems is emphasized; the case of a flowshop-like production process is analyzed in detail.},
	author = {Cohen, G. and Dubois, D. and Quadrat, J. and Viot, M.},
	citeulike-article-id = {1701998},
	comment = {Referenced for length of transient regime},
	journal = {{IEEE} Transactions on Automatic Control},
	keywords = {maxplus},
	number = {3},
	pages = {210--220},
	priority = {2},
	title = {A linear-system-theoretic view of discrete-event processes and its use for performance evaluation in manufacturing},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1103925},
	volume = {30},
	year = {1985}
}


@inproceedings{citeulike:1698253,
	abstract = {Some communication or manufacturing models can be represented as ``linear'' systems in the (max,+) algebra. In this paper, the author studies matrices in the (max,+) algebra. The author introduces a new tool for describing the deterministic spectral behaviour of matrices of size $3 \times 3$. It consists in a graphical representation of eigenvectors and domains of attraction in the projective space},
	author = {Mairesse, J.},
	booktitle = {Proceedings of the 33rd {IEEE} Conference on Decision and Control},
	citeulike-article-id = {1698253},
	keywords = {maxplus},
	pages = {2615--2620},
	priority = {2},
	title = {A graphical representation for matrices in the (max,+) algebra},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=411540},
	volume = {3},
	year = {1994}
}


@inproceedings{citeulike:1683887,
	abstract = {A novel upper bound is presented for the second largest eigenvalue of a finite reversible time-homogeneous Markov chain as a function of three parameters, namely the smallest transition probability, the underlying structure of the chain, and the skewness of the equilibrium distribution. Simulated annealing (SA) is an example of a probabilistic algorithm that is widely used for solving combinatorial optimization problems, wherein the transition probabilities are controlled by a certain temperature parameter $T > 0$. Using the results presented, it is possible to bound the time constant of convergence of SA to equilibrium at any fixed temperature $T > 0$, and also to study the temperature asymptotics, namely the growth of this bound as $T \to 0$},
	author = {Desai, M. P. and Rao, V. B.},
	booktitle = {Proceedings of the 1990 {IEEE} International Symposium on Circuits and Systems},
	citeulike-article-id = {1683887},
	comment = {Conditions for real eigenvalues of transition matrix, restatement of PF},
	keywords = {hmm},
	pages = {1211--1214},
	priority = {2},
	title = {A new eigenvalue bound for reversible {Markov} chains with applications to the temperature-asymptotics of simulated annealing},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=112347},
	volume = {2},
	year = {1990}
}


@article{citeulike:1646560,
	abstract = {The theory of computational complexity has some interesting links to physics, in particular to quantum computing and statistical mechanics. The article contains an informal introduction to this theory and its links to physics},
	author = {Mertens, S.},
	citeulike-article-id = {1646560},
	comment = {informal introduction to complexity classes and Ising model},
	journal = {Computing in Science \& Engineering},
	keywords = {ising},
	number = {3},
	pages = {31--47},
	priority = {2},
	title = {Computational complexity for physicists},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=998639},
	volume = {4},
	year = {2002}
}


@article{citeulike:1646531,
	abstract = {History of Ising model},
	author = {Hayes, Brian},
	citeulike-article-id = {1646531},
	comment = {history of Ising model},
	journal = {American Scientist},
	keywords = {ising},
	priority = {2},
	title = {The World in a Spin},
	year = {2000}
}


@misc{citeulike:1645787,
	abstract = {The q-state Potts model can be defined on an arbitrary finite graph, and its
partition function encodes much important information about that graph,
including its chromatic polynomial, flow polynomial and reliability polynomial.
The complex zeros of the Potts partition function are of interest both to
statistical mechanicians and to combinatorists. I give a pedagogical
introduction to all these problems, and then sketch two recent results: (a)
Construction of a countable family of planar graphs whose chromatic zeros are
dense in the whole complex q-plane except possibly for the disc $|q-1| < 1$. (b)
Proof of a universal upper bound on the q-plane zeros of the chromatic
polynomial (or antiferromagnetic Potts-model partition function) in terms of
the graph's maximum degree.},
	archiveprefix = {arXiv},
	author = {Sokal, Alan D.},
	citeulike-article-id = {1645787},
	comment = {proof that number of colorings of a graph with k colors is a restriction of a polynomial (chromatic polynomial),
 real zeros correspond to unsatisfiable graph, shows that no zeros for loopless graphs,
bounds the region where zeros must lie in complex plane,
why care about complex plane?},
	eprint = {cond-mat/9910503},
	keywords = {ising},
	month = oct,
	priority = {2},
	title = {Chromatic Polynomials, {Potts} Models and All That},
	url = {http://arxiv.org/abs/cond-mat/9910503},
	year = {1999}
}


@misc{citeulike:1645733,
	abstract = {We present a numerical method to evaluate partition functions and associated
correlation functions of inhomogeneous 2--D classical spin systems and 1--D
quantum spin systems. The method is scalable and has a controlled error. We
illustrate the algorithm by calculating the finite--temperature properties of
bosonic particles in 1--D optical lattices, as realized in current experiments.},
	archiveprefix = {arXiv},
	author = {Murg, V. and Verstraete, F. and Cirac, J. I.},
	citeulike-article-id = {1645733},
	comment = {2d grid partition function as contraction of 4d tensors},
	eprint = {cond-mat/0501493},
	keywords = {ising},
	month = jan,
	priority = {2},
	title = {Efficient evaluation of partition functions of frustrated and inhomogeneous spin systems},
	url = {http://arxiv.org/abs/cond-mat/0501493},
	year = {2005}
}


@article{citeulike:1645728,
	abstract = {The numerical transfer matrix for the partition function of discrete lattice models is generalized to allow the calculation of the density of states $\Omega(E)$; and the restricted density of states $\Omega(E; M)$. Given $\Omega(E; M)$ the partition function is expressed as a polynomial in the variables $x = e^{\beta h}$ and $y = e^{-\beta}$. These algorithms are illustrated with calculations for the Ising model on finite square lattices. The zeros of the partition function are examined in both the complex $x$ and $y$ planes. Finite size scaling analysis of the zeros leads to very accurate estimates for the critical temperature and critical exponents.},
	author = {Creswick, R. J.},
	citeulike-article-id = {1645728},
	comment = {Formulates density of states, partition in terms of trace},
	doi = {10.1103/PhysRevE.52.R5735},
	journal = {Physical Review E},
	keywords = {ising},
	month = dec,
	number = {6},
	pages = {R5735+},
	priority = {2},
	publisher = {American Physical Society},
	title = {Transfer matrix for the restricted canonical and microcanonical ensembles},
	url = {http://dx.doi.org/10.1103/PhysRevE.52.R5735},
	volume = {52},
	year = {1995}
}


@misc{citeulike:1505132,
	abstract = {We consider the recursive equation ``x(n+1)=A(n)x(n)'' where x(n+1) and x(n)
are column vectors of size k and where A(n) is an irreducible random matrix of
size k x k. The matrix-vector multiplication in the (max,+) algebra is defined
by $(A(n)x(n))_i = \max_j [ A(n)_{ij} + x(n)_j ]$. This type of equation can be used
to represent the evolution of Stochastic Event Graphs which include cyclic
Jackson Networks, some manufacturing models and models with general blocking
(such as Kanban). Let us assume that the sequence $(A(n))_n$ is i.i.d or more
generally stationary and ergodic. The main result of the paper states that the
system couples in finite time with a unique stationary regime if and only if
there exists a set of matrices C such that $P\{A(0) \in C\} > 0$, and the matrices
in C have a unique periodic regime.},
	archiveprefix = {arXiv},
	author = {Mairesse, Jean},
	citeulike-article-id = {1505132},
	comment = {nonhomogenous products in max-plus algebra},
	eprint = {0707.3672},
	keywords = {maxplus},
	month = jul,
	priority = {2},
	title = {Products of irreducible random matrices in the {(Max,+)} Algebra},
	url = {http://arxiv.org/abs/0707.3672},
	year = {2007}
}


@misc{citeulike:1626090,
	abstract = {The standard theorem for regular stochastic matrices is generalized to
matrices with no sign restriction on the entries. The condition that column
sums be equal to 1 is kept, but the regularity condition is replaced by a
condition on the $\ell_1$-distances between columns.},
	archiveprefix = {arXiv},
	author = {{\'C}urgus, Branko and Jewett, Robert I.},
	citeulike-article-id = {1626090},
	comment = {Extends Perron-Frobenius to non-primitive matrices where Doeblin ergodicity coefficient is smaller than 1, some properties of Doeblin coefficient},
	eprint = {0709.0309},
	keywords = {ergodicity, hmm, linear-algebra},
	month = sep,
	priority = {2},
	title = {Somewhat stochastic matrices},
	url = {http://arxiv.org/abs/0709.0309},
	year = {2007}
}


@misc{citeulike:1634027,
  author               = {Akian, M.  and Quadrat, J.  and Viot, M. },
  title                = {Duality between probability and optimization},
  year                 = {1997},
  url                  = {http://citeseer.ist.psu.edu/383978.html},
  keywords             = {maxplus},
  abstract             = {this paper. The link between the weak convergence and the
epigraph convergence used in convex analysis is done.
The Cramer transform used in the large deviation literature is defined as
the composition of the Laplace transform by the logarithm by the Fenchel
transform. It transforms convolution into inf-convolution. Probabilistic results
about processes with independent increments are then transformed into
similar results on dynamic programming equations. Cramer transform gives
new insight on...},
  comment              = {linked from Baez},
  priority             = {2},
  citeulike-article-id = {1634027}
}


@misc{citeulike:1633256,
	abstract = {A theory of additive Markov chains with long-range memory, proposed earlier
in Phys. Rev. E 68, 06117 (2003), is developed and used to describe statistical
properties of long-range correlated systems. The convenient characteristics of
such systems, a memory function, and its relation to the correlation properties
of the systems are examined. Various methods for finding the memory function
via the correlation function are proposed. The inverse problem (calculation of
the correlation function by means of the prescribed memory function) is also
solved. This is demonstrated for the analytically solvable model of the system
with a step-wise memory function.},
	archiveprefix = {arXiv},
	author = {Melnyk, S. S. and Usatenko, O. V. and Yampol'skii, V. A. and Apostolov, S. S. and Mayzelis, Z. A.},
	citeulike-article-id = {1633256},
	comment = {same as Rosenberg's mixture of parents model?
---=note-separator=---
Mentions Fourier transform as a method to generate random sequences with long range interactions},
	eprint = {physics/0603171},
	month = mar,
	priority = {2},
	title = {Memory functions and Correlations in Additive Binary {Markov} Chains},
	url = {http://arxiv.org/abs/physics/0603171},
	year = {2006}
}


@article{citeulike:1204751,
	author = {Mitzenmacher, Michael},
	citeulike-article-id = {1204751},
	comment = {derivation of power law for web graphs, power law as the solution to word length optimization (Mandelbrot), monkeys on typewriters resulting in lognormal/powerlaw, double pareto as an exponential mixture of log-normals (good fit to letter frequencies)},
	journal = {Internet Mathematics},
	number = {2},
	pages = {226--251},
	priority = {2},
	title = {A Brief History of Generative Models for Power Law and Lognormal Distributions},
	url = {http://projecteuclid.org/DPubS?service=UI\&version=1.0\&verb=Display\&handle=euclid.im/1089229510},
	volume = {1},
	year = {2004}
}


@article{PhysRevE.54.220,
	author = {Perline, Richard},
	citeulike-article-id = {1633142},
	doi = {10.1103/PhysRevE.54.220},
	journal = {Physical Review E},
	keywords = {bibtex-import},
	number = {1},
	pages = {220--223},
	priority = {2},
	publisher = {American Physical Society},
	title = {{Zipf's} law, the central limit theorem, and the random division of the unit interval},
	volume = {54},
	year = {1996}
}


@techreport{citeulike:1632669,
	abstract = {The Goemans-Williamson randomized algorithm guarantees a high-quality approximation to the Max-Cut problem,
but the cost associated with such an approximation can be excessively high for large-scale problems due to the need for
solving an expensive semidefinite relaxation. In order to achieve better practical performance, we propose an alternative,
rank-two relaxation and develop a specialized version of the Goemans-Williamson technique. The proposed approach leads
to continuous optimization...},
	address = {Atlanta, GA},
	author = {Burer, S. and Monteiro, R. D. C. and Zhang, Y.},
	citeulike-article-id = {1632669},
	comment = {intro to SDP relaxation for max-cut,
rank-2 relaxation non-convex but efficient, references to Ising literature
---=note-separator=---
application of max-cut to Ising ground-state problem},
	priority = {2},
	title = {Rank-Two Relaxation Heuristics for {Max-Cut} and Other Binary Quadratic Programs},
	url = {http://citeseer.ist.psu.edu/burer00ranktwo.html},
	year = {2000}
}


@article{citeulike:1632470,
	author = {Cipra, Barry A.},
	citeulike-article-id = {1632470},
	comment = {NP-complete for non-planar graphs},
	journal = {SIAM News},
	number = {6},
	priority = {2},
	title = {The {Ising} model is {NP}-complete},
	url = {http://www.siam.org/pdf/news/654.pdf},
	volume = {33},
	year = {2000}
}


@book{citeulike:1628558,
	abstract = {Here is a valuable text and research tool for scientists and engineers who use or work with theory and computation associated with practical problems relating to Markov chains and queuing networks, economic analysis, or mathematical programming. Originally published in 1979, this new edition adds material that updates the subject relative to developments from 1979 to 1993. Theory and applications of nonnegative matrices are blended here, and extensive references are included in each area. You will be led from the theory of positive operators via the Perron-Frobenius theory of nonnegative matrices and the theory of inverse positivity, to the widely used topic of M-matrices.},
	author = {Berman, Abraham and Plemmons, Robert J.},
	citeulike-article-id = {1628558},
	comment = {some new tools of analyzing markov chains?},
	howpublished = {Paperback},
	isbn = {0898713218},
	month = jan,
	priority = {3},
	publisher = {Society for Industrial and Applied Mathematics},
	series = {Classics in Applied Mathematics},
	title = {Nonnegative Matrices in the Mathematical Sciences},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0898713218},
	year = {1987}
}


@article{citeulike:1628369,
	abstract = {In this paper, we develop a measure-theoretic version of the junction tree algorithm to compute desired marginals of a product function. We reformulate the problem in a measure-theoretic framework, where the desired marginals are viewed as corresponding conditional expectations of a product of random variables. We generalize the notions of independence and junction trees to collections of $\sigma$-fields on a space with a signed measure. We provide an algorithm to find such a junction tree when one exists. We also give a general procedure to augment the $\sigma$-fields to create independencies, which we call "lifting." This procedure is the counterpart of the moralization and triangulation procedure in the conventional generalized distributive law (GDL) framework, in order to guarantee the existence of a junction tree. Our procedure includes the conventional GDL procedure as a special case. However, it can take advantage of structures at the atomic level of the sample space to produce junction tree-based algorithms for computing the desired marginals that are less complex than those GDL can discover, as we argue through examples. Our formalism gives a new way by which one can hope to find low-complexity algorithms for marginalization problems.},
	author = {Pakzad, P. and Anantharam, V.},
	citeulike-article-id = {1628369},
	comment = {using structure of clique probability distributions, discovers optimal structure for block turbo codes},
	journal = {IEEE Transactions on Information Theory},
	keywords = {hmm},
	number = {6},
	pages = {1132--1155},
	priority = {2},
	title = {A new look at the generalized distributive law},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1302294},
	volume = {50},
	year = {2004}
}


@article{citeulike:1628183,
  author               = {Golan},
  title                = {Some recent applications of semiring theory},
  keywords             = {hmm, linear-algebra},
  comment              = {definitions of semirings,
traveling salesman as solution of polynomial,
maxplus for belief propagation
},
  priority             = {2},
  citeulike-article-id = {1628183}
}


@article{citeulike:1626216,
  author               = {Fall and Quadrat},
  title                = {About Min-Plus Product Forms},
  year                 = {1998},
  keywords             = {linear-algebra},
  comment              = {defines product forms},
  priority             = {2},
  citeulike-article-id = {1626216}
}


@book{citeulike:669570,
	abstract = {Linear algebra and matrix theory have long been fundamental tools in mathematical disciplines as well as fertile fields for research. In this book the authors present classical and recent results of matrix analysis that have proved to be important to applied mathematics. Facts about matrices, beyond those found in an elementary linear algebra course, are needed to understand virtually any area of mathematical science, but the necessary material has appeared only sporadically in the literature and in university curricula. As interest in applied mathematics has grown, the need for a text and reference offering a broad selection of topics in matrix theory has become apparent, and this book meets that need. This volume reflects two concurrent views of matrix analysis. First, it encompasses topics in linear algebra that have arisen out of the needs of mathematical analysis. Second, it is an approach to real and complex linear algebraic problems that does not hesitate to use notions from analysis. Both views are reflected in its choice and treatment of topics.},
	author = {Horn, Roger A. and Johnson, Charles R.},
	citeulike-article-id = {669570},
	comment = {Theorem on uniqueness of positive square root},
	howpublished = {Paperback},
	isbn = {0521386322},
	keywords = {linear-algebra},
	month = feb,
	priority = {2},
	publisher = {Cambridge University Press},
	title = {Matrix Analysis},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0521386322},
	year = {1990}
}


@inproceedings{citeulike:1611186,
	abstract = {The asymptotic properties of inhomogeneous products in the max-plus algebra context have been investigated. In particular, for products involving matrices with the same unique critical circuit, we have obtained some sufficiency conditions under which the rank of the final product matrix is less than or equal to the length of the critical circuit of the matrices in the product. For a product comprising matrices with the same unique critical circuit of length 1, the asymptotic rank is 1},
	author = {Shue, L. and Anderson, B. D. O. and Dey, S.},
	booktitle = {Proceedings of the 1998 American Control Conference},
	citeulike-article-id = {1611186},
	comment = {conditions on convergence to rank 1},
	keywords = {hmm, linear-algebra, maxplus},
	pages = {1909--1913},
	priority = {2},
	title = {On steady-state properties of certain max-plus products},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=707354},
	volume = {3},
	year = {1998}
}


@misc{citeulike:1611063,
	abstract = {The resistance between arbitrary two nodes in a resistor network is obtained
in terms of the eigenvalues and eigenfunctions of the Laplacian matrix
associated with the network. Explicit formulas for two-point resistances are
deduced for regular lattices in one, two, and three dimensions under various
boundary conditions including that of a Moebius strip and a Klein bottle. The
emphasis is on lattices of finite sizes. We also deduce summation and product
identities which can be used to analyze large-size expansions of two-and-higher
dimensional lattices.},
	archiveprefix = {arXiv},
	author = {Wu, F. Y.},
	citeulike-article-id = {1611063},
	comment = {formula for resistance between two nodes},
	eprint = {math-ph/0402038},
	keywords = {hmm, linear-algebra, resistance},
	month = feb,
	priority = {2},
	title = {Theory of resistor networks: The two-point resistance},
	url = {http://arxiv.org/abs/math-ph/0402038},
	year = {2004}
}


@misc{citeulike:1611061,
	abstract = {this paper we'll formulate the results in
terms of random walks, and mostly restrict our attention to the undirected
case.},
	author = {Lov{\'a}sz, L{\'a}szl{\'o}},
	citeulike-article-id = {1611061},
	comment = {- Mixing time, first access time, harmonic functions, commute time is proportional to resistance. Connection to differential equations
- Simple explanation of electric connection},
	keywords = {hmm, linear-algebra},
	priority = {2},
	title = {Random Walks on Graphs: A Survey},
	url = {http://citeseer.ist.psu.edu/125830.html}
}


@book{citeulike:1606649,
	abstract = {This completely revised second edition presents an introduction to statistical pattern recognition. Pattern recognition in general covers a wide range of problems: it is applied to engineering problems, such as character readers and wave form analysis as well as to brain modeling in biology and psychology. Statistical decision and estimation, which are the main subjects of this book, are regarded as fundamental to the study of pattern recognition. This book is appropriate as a text for introductory courses in pattern recognition and as a reference book for workers in the field. Each chapter contains computer projects as well as exercises.},
	author = {Fukunaga, Keinosuke},
	citeulike-article-id = {1606649},
	comment = {integration of gaussians, linear algebra primer},
	edition = {Second},
	howpublished = {Hardcover},
	isbn = {0122698517},
	keywords = {linear-algebra},
	month = sep,
	priority = {2},
	publisher = {Academic Press},
	series = {Computer Science and Scientific Computing},
	title = {Introduction to Statistical Pattern Recognition},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0122698517},
	year = {1990}
}


@book{citeulike:873540,
	abstract = {The dramatic growth in practical applications for machine learning over the last ten years has been accompanied by many important developments in the underlying algorithms and techniques. For example, Bayesian methods have grown from a specialist niche to become mainstream, while graphical models have emerged as a general framework for describing and applying probabilistic techniques. The practical applicability of Bayesian methods has been greatly enhanced by the development of a range of approximate inference algorithms such as variational Bayes and expectation propagation, while new models based on kernels have had a significant impact on both algorithms and applications. This completely new textbook reflects these recent developments while providing a comprehensive introduction to the fields of pattern recognition and machine learning. It is aimed at advanced undergraduates or first-year PhD students, as well as researchers and practitioners. No previous knowledge of pattern recognition or machine learning concepts is assumed. Familiarity with multivariate calculus and basic linear algebra is required, and some experience in the use of probabilities would be helpful though not essential as the book includes a self-contained introduction to basic probability theory. The book is suitable for courses on machine learning, statistics, computer science, signal processing, computer vision, data mining, and bioinformatics. Extensive support is provided for course instructors, including more than 400 exercises, graded according to difficulty. Example solutions for a subset of the exercises are available from the book web site, while solutions for the remainder can be obtained by instructors from the publisher. The book is supported by a great deal of additional material, and the reader is encouraged to visit the book web site for the latest information.
A forthcoming companion volume will deal with practical aspects of pattern recognition and machine learning, and will include free software implementations of the key algorithms along with example data sets and demonstration programs. Christopher Bishop is Assistant Director at Microsoft Research Cambridge, and also holds a Chair in Computer Science at the University of Edinburgh. He is a Fellow of Darwin College Cambridge, and was recently elected Fellow of the Royal Academy of Engineering. The author's previous textbook "Neural Networks for Pattern Recognition" has been widely adopted.},
	author = {Bishop, Christopher M.},
	citeulike-article-id = {873540},
	comment = {Appendix on linear algebra},
	howpublished = {Hardcover},
	isbn = {0387310738},
	keywords = {book, linear-algebra, machine-learning},
	month = aug,
	priority = {2},
	publisher = {Springer},
	series = {Information Science and Statistics},
	title = {Pattern Recognition and Machine Learning},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0387310738},
	year = {2006}
}


@misc{citeulike:1606566,
	abstract = {The affine rank minimization problem consists of finding a matrix of minimum
rank that satisfies a given system of linear equality constraints. Such
problems have appeared in the literature of a diverse set of fields including
system identification and control, Euclidean embedding, and collaborative
filtering. Although specific instances can often be solved with specialized
algorithms, the general affine rank minimization problem is NP-hard. In this
paper, we show that if a certain restricted isometry property holds for the
linear transformation defining the constraints, the minimum rank solution can
be recovered by solving a convex optimization problem, namely the minimization
of the nuclear norm over the given affine space. We present several random
ensembles of equations where the restricted isometry property holds with
overwhelming probability. The techniques used in our analysis have strong
parallels in the compressed sensing framework. We discuss how affine rank
minimization generalizes this pre-existing concept and outline a dictionary
relating concepts from cardinality minimization to those of rank minimization.},
	archiveprefix = {arXiv},
	author = {Recht, Benjamin and Fazel, Maryam and Parrilo, Pablo A.},
	citeulike-article-id = {1606566},
	comment = {- relations between different norms, formulating constrained min rank as semi-definite programming
- Table 1, vector vs. matrix norms, l1=nuclear
- eq 2.1 matrix norm inequalities},
	eprint = {0706.4138},
	keywords = {linear-algebra, optimization},
	month = jun,
	priority = {2},
	title = {Guaranteed Minimum-Rank Solutions of Linear Matrix Equations via Nuclear Norm Minimization},
	url = {http://arxiv.org/abs/0706.4138},
	year = {2007}
}


@book{citeulike:1606564,
	abstract = {". . . an authentic magnum opus worth much more than its weight in gold!"---IEEE Transactions on Automatic Control, from a review of the First Edition.
"The best book I've seen on the subject of Kalman filtering . . . Reading other books on Kalman filters and not this one could make you a very dangerous Kalman filter engineer."---Amazon.com, from a review of the First Edition.
In this practical introduction to Kalman filtering theory and applications, authors Grewal and Andrews draw upon their decades of experience to offer an in-depth examination of the subtleties, common problems, and limitations of estimation theory as it applies to real-world situations. They provide many illustrative examples drawn from an array of application areas including GPS-aided INS, the modeling of gyros and accelerometers, inertial navigation, and freeway traffic control. In addition, they share many hard-won lessons about, and original methods for, designing, implementing, validating, and improving Kalman filters, including techniques for:
* Representing the problem in a mathematical model
* Analyzing estimator performance as a function of model parameters
* Implementing the mechanization equations in numerically stable algorithms
* Assessing computational requirements
* Testing the validity of results
* Monitoring filter performance in operation
As the best way to understand and master a technology is to observe it in action, Kalman Filtering: Theory and Practice Using MATLAB{\textregistered}, Second Edition includes companion software in MATLAB{\textregistered}, providing users with an opportunity to experience first hand the filter's workings and its limitations.
This updated and revised edition of Grewal and Andrews's classic guide is an indispensable working resource for engineers and computer scientists involved in the design of aerospace and aeronautical systems, global positioning and radar tracking systems, power systems, and biomedical instrumentation.
An Instructor's Manual presenting detailed solutions to all the problems in the book is available from the Wiley editorial department.},
	author = {Grewal, Mohinder S. and Andrews, Angus P.},
	citeulike-article-id = {1606564},
	comment = {types of sparse matrices},
	edition = {Second},
	howpublished = {Hardcover},
	isbn = {0471392545},
	keywords = {linear-algebra},
	month = jan,
	priority = {2},
	publisher = {Wiley-Interscience},
	title = {{Kalman} Filtering: Theory and Practice Using {MATLAB}},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0471392545},
	year = {2001}
}


@misc{citeulike:1590871,
	abstract = {Three algorithms for the model reduction of large-scale, continuous-time, time-invariant,
linear, dynamical systems with a sparse or structured transition matrix
and a small number of inputs and outputs are described. They rely on low rank approximations
to the controllability and observability Gramians, which can efficiently
be computed by ADI based iterative low rank methods. The first two model reduction
methods are closely related to the well-known square root method and Schur
method, which are...},
	author = {Penzl, T.},
	citeulike-article-id = {1590871},
	comment = {State space reduction techniques in large dynamical systems},
	keywords = {hmm},
	priority = {2},
	title = {Algorithms for model reduction of large dynamical systems},
	url = {http://citeseer.ist.psu.edu/penzl99algorithms.html},
	year = {1999}
}


@misc{citeulike:1590787,
	abstract = {In this paper we prove exponential asymptotic stability for discrete
time filters for signals arising as solutions of d-dimensional stochastic difference
equations. The observation process is the signal corrupted by an
additive white noise of sufficiently small variance. The model for the signal
admits non-ergodic processes. We show that almost surely, the total
variation distance between the optimal filter and an incorrectly initialized
filter converges to 0 exponentially fast as time
...},
	author = {Budhiraja, A. and Ocone, D.},
	citeulike-article-id = {1590787},
	comment = {Shows that exponential rate of convergence decreases exponentially as noise goes to 0},
	keywords = {ergodicity, hmm},
	priority = {2},
	title = {Exponential stability in discrete time filtering for non-ergodic signals},
	url = {http://citeseer.ist.psu.edu/budhiraja99exponential.html},
	year = {1999}
}


@misc{citeulike:1586736,
	abstract = {According to a 1975 result of T. Kaijser, if some nonvanishing product of
hidden Markov model (HMM) stepping matrices is subrectangular, and the
underlying chain is aperiodic, the corresponding $\alpha$-chain has a unique
invariant limiting measure $\lambda$. Here the $\alpha$-chain
$\{\alpha_n\}=\{(\alpha_{ni})\}$ is given by \[\alpha_{ni}=P(X_n=i|
Y_n,Y_{n-1},...),\] where $\{(X_n,Y_n)\}$ is a finite state HMM with unobserved
Markov chain component $\{X_n\}$ and observed output component $\{Y_n\}$. This
defines $\{\alpha_n\}$ as a stochastic process taking values in the probability
simplex. It is not hard to see that $\{\alpha_n\}$ is itself a Markov chain.
The stepping matrices $M(y)=(M(y)_{ij})$ give the probability that
$(X_n,Y_n)=(j,y)$, conditional on $X_{n-1}=i$. A matrix is said to be
subrectangular if the locations of its nonzero entries forms a cartesian
product of a set of row indices and a set of column indices. Kaijser's result
is based on an application of the Furstenberg--Kesten theory to the random
matrix products $M(Y_1)M(Y_2)... M(Y_n)$. In this paper we prove a slightly
stronger form of Kaijser's theorem with a simpler argument, exploiting the
theory of e chains.},
	archiveprefix = {arXiv},
	author = {Kochman, Fred and Reeds, Jim},
	citeulike-article-id = {1586736},
	comment = {Gives Kaijser's result under weaker condition (some sequence of partial products of stepping matrices converges to rank 1 matrix)},
	eprint = {math/0702248v1},
	keywords = {ergodicity, hmm},
	month = feb,
	priority = {2},
	title = {A simple proof of {Kaijser's} unique ergodicity result for hidden {Markov} {$\alpha$}-chains},
	url = {http://arxiv.org/abs/math/0702248v1},
	year = {2007}
}


@article{citeulike:1584478,
	abstract = {A nonstationary Markov chain is weakly ergodic if the dependence of the state distribution on the starting state vanishes as time tends to infinity. A chain is strongly ergodic if it is weakly ergodic and converges in distribution. In this paper we show that the two ergodicity concepts are equivalent for finite chains under rather general (and widely verifiable) conditions. We discuss applications to probabilistic analyses of general search methods for combinatorial optimization problems (simulated annealing).},
	author = {Anily, Shoshana and Federgruen, Awi},
	citeulike-article-id = {1584478},
	comment = {Definitions of weak/strong ergodicity for Markov chains
references for early study of ergodicity for nonstationary chains
Condition for weakly ergodic non-stationary Markov chain to be strongly ergodic.

Modifies the condition for strong ergodicity (sum of differences of eigenvectors converges) to not require explicit form of eigenvectors},
	journal = {Operations Research},
	number = {6},
	pages = {867--874},
	priority = {2},
	title = {Ergodicity in Parametric Nonstationary {Markov} Chains: An Application to Simulated Annealing Methods},
	url = {http://links.jstor.org/sici?sici=0030-364X\%28198711\%2F12\%2935\%3A6\%3C867\%3AEIPNMC\%3E2.0.CO\%3B2-G},
	volume = {35},
	year = {1987}
}


@article{citeulike:1581468,
	author = {Pflug, G. and Schachermayer, W.},
	citeulike-article-id = {1581468},
	comment = {Induces distance on distributions from metric on state space (Wasserstein metric). Show that ergodic matrix implies there exist Wasserstein metric under which multiplication is a contraction, best contraction in this metric determined by second eigenvalue},
	journal = {Journal of Applied Probability},
	keywords = {ergodicity, hmm},
	number = {4},
	pages = {850--860},
	priority = {2},
	title = {Coefficients of Ergodicity for Stochastically Monotone {Markov} Chains},
	url = {http://links.jstor.org/sici?sici=0021-9002\%28199212\%2929\%3A4\%3C850\%3ACOEFSM\%3E2.0.CO\%3B2-\%23},
	volume = {29},
	year = {1992}
}


@article{citeulike:1581308,
	abstract = {The weak ergodic theorems of mathematical demography state that the age distribution of a closed population is asymptotically independent of the initial distribution. In this paper, we provide a new proof of the weak ergodic theorem of the multistate population model with continuous time. The main tool to attain this purpose is a theory of multiplicative processes, which was mainly developed by Garrett Birkhoff, who showed that ergodic properties generally hold for an appropriate class of multiplicative processes. First, we construct a general theory of multiplicative processes on a Banach lattice. Next, we formulate a dynamical model of a multistate population and show that its evolution operator forms a multiplicative process on the state space of the population. Subsequently, we investigate a sufficient condition that guarantees the weak ergodicity of the multiplicative process. Finally, we prove the weak and strong ergodic theorems for the multistate population and resolve the consistency problem.},
	author = {Inaba, H.},
	citeulike-article-id = {1581308},
	comment = {Birkhoff's theory from lattice theory standpoint},
	issn = {0025-5564},
	journal = {Mathematical Biosciences},
	keywords = {ergodicity, hmm},
	month = oct,
	number = {2},
	pages = {195--219},
	priority = {2},
	title = {Weak ergodicity of population evolution processes},
	url = {http://view.ncbi.nlm.nih.gov/pubmed/2520198},
	volume = {96},
	year = {1989}
}


@article{citeulike:1581066,
	author = {Kaijser, Thomas},
	citeulike-article-id = {1581066},
	comment = {shows that conditional distribution P(Y\_n|X1,...,X\_n) converges if
1. sequence Y\_i is ergodic
2. some product of stepping matrices is subrectangular},
	journal = {The Annals of Probability},
	keywords = {ergodicity, hmm},
	number = {4},
	pages = {677--696},
	priority = {2},
	title = {A Limit Theorem for Partially Observed {Markov} Chains},
	url = {http://links.jstor.org/sici?sici=0091-1798\%28197508\%293\%3A4\%3C677\%3AALTFPO\%3E2.0.CO\%3B2-O},
	volume = {3},
	year = {1975}
}


@article{citeulike:1581045,
	author = {Hartfiel, D. J.},
	title = {A Simplified Form for Nearly Reducible and Nearly Decomposable Matrices},
	journal = {Proceedings of the American Mathematical Society},
	volume = {24},
	number = {2},
	pages = {388--393},
	year = {1970},
	url = {http://links.jstor.org/sici?sici=0002-9939\%28197002\%2924\%3A2\%3C388\%3AASFFNR\%3E2.0.CO\%3B2-U},
	citeulike-article-id = {1581045},
	keywords = {ergodicity, hmm},
	priority = {2},
	comment = {definition of irreducible, fully indecomposable}
}


@book{citeulike:1578529,
	abstract = {Infinite products of matrices are used in nonhomogeneous Markov chains, Markov set-chains, demographics, probabilistic automata, production and manpower systems, tomography, and fractals. More recent results have been obtained in computer design of curves and surfaces. This book puts together much of the basic work on infinite products of matrices, providing a primary source for such work. This will eliminate the rediscovery of known results in the area, and thus save considerable time for researchers who work with infinite products of matrices. In addition, two chapters are included to show how infinite products of matrices are used in graphics and in systems work.},
	author = {Hartfiel, Darald J.},
	citeulike-article-id = {1578529},
	comment = {Convergence in Hilbert's metric, in quotient spaces, conditions on convergence of infinite products},
	howpublished = {Hardcover},
	isbn = {9810246285},
	keywords = {ergodicity},
	month = jan,
	priority = {2},
	publisher = {{World Scientific Publishing Company}},
	title = {Nonhomogeneous Matrix Products},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/9810246285},
	year = {2002}
}


@misc{citeulike:1576193,
	author = {Ferreira, P.},
	title = {Fixed point problems --- an introduction},
	year = {1996},
	url = {http://citeseer.ist.psu.edu/574465.html},
	citeulike-article-id = {1576193},
	keywords = {dynamical-systems},
	priority = {2},
	comment = {(private-note)- basic theorems and properties of contraction mappings
- relates spectral norm (induced l-2 norm) to spectral radius (largest eigenvalue magnitude), identical for Hermitian
- Brouwer's fixed point theorem from Sperner's lemma for graphs},
	abstract = {This paper surveys a number of fundamental results
on the existence and uniqueness of fixed points for certain
classes of possibly nonlinear operators. I do not try to
be exhaustive, but merely to present the results that are more
useful in the context of signal and image reconstruction. Some
specific aspects pertaining to linear operators, and linear operators
in finite dimensional spaces, are also discussed. It is
shown that the set of fixed points of a nonexpansive operator is
either empty...}
}


@article{citeulike:1575880,
	address = {Cambridge, MA, USA},
	author = {Ihler, Alexander T. and Fisher, III, John W. and Willsky, Alan S.},
	citeulike-article-id = {1575880},
	comment = {larger version of "message errors" paper},
	issn = {1533-7928},
	journal = {Journal of Machine Learning Research},
	keywords = {message-passing},
	pages = {905--936},
	priority = {2},
	publisher = {MIT Press},
	title = {Loopy Belief Propagation: Convergence and Effects of Message Errors},
	url = {http://portal.acm.org/citation.cfm?id=1088703},
	volume = {6},
	year = {2005}
}


% NOTE(review): the abstract's semiring notation was rebuilt from PDF-extraction residue
% ("R[f-1g" read as R union {-infinity}); the entry type, institution and report number
% assume this is the INRIA research-report version -- confirm before citing.
@techreport{citeulike:1572774,
	abstract = {Exotic semirings such as the ``(max,+) semiring'' $(R \cup \{-\infty\}, \max, +)$, or the ``tropical
semiring'' $(N \cup \{+\infty\}, \min, +)$, have been invented and reinvented many times since the late fifties,
in relation with various fields: performance evaluation of manufacturing systems and discrete event
system theory; graph theory (path algebra) and Markov decision processes, Hamilton-Jacobi theory;
asymptotic analysis (low temperature asymptotics in statistical physics, large deviations, WKB
method); language theory (automata with multiplicities).
Despite this apparent profusion, there is a small set of common, non-naive, basic results and problems,
in general not known outside the (max,+) community, which seem to be useful in most applications.
The aim of this short survey paper is to present what we believe to be the minimal core
of (max,+) results, and to illustrate these results by typical applications, at the frontier of language
theory, control, and operations research (performance evaluation of discrete event systems, analysis
of Markov decision processes with average cost).
Basic techniques include: solving all kinds of systems of linear equations, sometimes with exotic
symmetrization and determinant techniques; using the (max,+) Perron-Frobenius theory to study
the dynamics of (max,+) linear maps. We point out some open problems and current developments.
Key-words: Max-algebra, tropical semiring, dioid, idempotent semiring, linear equations, semimodule,
Perron-Frobenius theorem, linear dynamical systems, discrete event systems, Markov decision
processes, dynamic programming, asymptotic calculus.},
	author = {Gaubert, St{\'e}phane and Plus, Max},
	citeulike-article-id = {1572774},
	comment = {Tetris contours are products of matrices, perron-frobenius theory for max-plus},
	institution = {INRIA},
	keywords = {linear-algebra},
	number = {RR-3088},
	priority = {2},
	title = {Methods and Applications of (max,+) Linear Algebra},
	year = {1997}
}


@article{citeulike:1550427,
	address = {New York, NY, USA},
	author = {Achlioptas, Dimitris and McSherry, Frank},
	citeulike-article-id = {1550427},
	comment = {gives test for determining the best rank (when residual matrix is similar to random matrix)},
	doi = {10.1145/1219092.1219097},
	issn = {0004-5411},
	journal = {Journal of the ACM},
	keywords = {linear-algebra},
	month = apr,
	number = {2},
	priority = {2},
	publisher = {ACM Press},
	title = {Fast computation of low-rank matrix approximations},
	url = {http://portal.acm.org/citation.cfm?id=1219097},
	volume = {54},
	year = {2007}
}


@article{citeulike:1542324,
	address = {Duluth, MN, USA},
	author = {H{\aa}stad, Johan},
	citeulike-article-id = {1542324},
	comment = {Reduction to 3-sat},
	doi = {10.1016/0196-6774(90)90014-6},
	issn = {0196-6774},
	journal = {Journal of Algorithms},
	keywords = {computational-complexity, linear-algebra},
	month = dec,
	number = {4},
	pages = {644--654},
	priority = {2},
	publisher = {Academic Press, Inc.},
	title = {Tensor rank is {NP}-complete},
	url = {http://portal.acm.org/citation.cfm?id=95990},
	volume = {11},
	year = {1990}
}


@article{citeulike:1542203,
	author = {van Gerven, Marcel},
	title = {Tensor Decompositions for Probabilistic Classification},
	citeulike-article-id = {1542203},
	keywords = {linear-algebra},
	priority = {2},
	comment = {A view of rank-1 expansion as a mixture model},
	abstract = {Tensor decompositions are introduced as a novel
approach to probabilistic classification and can
be interpreted as a particular kind of mixture
model. Since many problems in medicine and
biology can be described as a classification problem,
the approach is seen as a useful tool for
biomedical data analysis. The approach is validated
by means of a clinical database consisting
of data about 1002 patients that suffer from hepatic
disease. It is shown that the approach performs
comparably to state-of-the-art results that
have been obtained using a naive Bayes classifier.}
}


@article{citeulike:1529068,
	abstract = {This paper studies a difficult and fundamental problem that arises throughout electrical engineering, applied mathematics, and statistics. Suppose that one forms a short linear combination of elementary signals drawn from a large, fixed collection. Given an observation of the linear combination that has been contaminated with additive noise, the goal is to identify which elementary signals participated and to approximate their coefficients. Although many algorithms have been proposed, there is little theory which guarantees that these algorithms can accurately and efficiently solve the problem. This paper studies a method called convex relaxation, which attempts to recover the ideal sparse signal by solving a convex program. This approach is powerful because the optimization can be completed in polynomial time with standard scientific software. The paper provides general conditions which ensure that convex relaxation succeeds. As evidence of the broad impact of these results, the paper describes how convex relaxation can be used for several concrete signal recovery problems. It also describes applications to channel coding, linear regression, and numerical analysis.},
	author = {Tropp, J. A.},
	citeulike-article-id = {1529068},
	comment = {survey of convex relaxation approximation techniques (l1) for basis pursuit},
	journal = {IEEE Transactions on Information Theory},
	keywords = {regularization},
	number = {3},
	pages = {1030--1051},
	priority = {2},
	title = {Just relax: convex programming methods for identifying sparse signals in noise},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1603770},
	volume = {52},
	year = {2006}
}


@incollection{NIPS2006_849,
	address = {Cambridge, MA},
	author = {Lee, Su-In and Ganapathi, Varun and Koller, Daphne},
	booktitle = {Advances in Neural Information Processing Systems 19},
	citeulike-article-id = {1526374},
	comment = {Compares grafting/Pietra's gain based heuristics for adding features},
	editor = {Sch{\"o}lkopf, B. and Platt, J. and Hoffman, T.},
	keywords = {graphical},
	pages = {817--824},
	priority = {2},
	publisher = {MIT Press},
	title = {Efficient Structure Learning of {Markov} Networks using {L1}-Regularization},
	year = {2007}
}


@article{citeulike:1525882,
	author = {Kleptsyna, M. L. and Veretennikov, A. Yu.},
	citeulike-article-id = {1525882},
	comment = {Handles the "average data" case?},
	keywords = {ergodicity, hmm},
	priority = {2},
	title = {On discrete time ergodic filters with wrong initial data},
	year = {2007}
}


% NOTE(review): journal, volume, number, pages and year describe the Annals of Applied
% Probability version that the URL handle (euclid.aoap) points to; the export had
% year 2001, presumably the earlier report -- confirm against the publisher record.
@article{citeulike:1525863,
	author = {Le Gland, Fran{\c{c}}ois and Oudjane, Nadia},
	citeulike-article-id = {1525863},
	comment = {Larger version of earlier report},
	journal = {The Annals of Applied Probability},
	keywords = {ergodicity, hmm},
	number = {1},
	pages = {144--187},
	priority = {2},
	title = {Stability and uniform approximation of nonlinear filters using the {Hilbert} metric and application to particle filters},
	url = {http://projecteuclid.org/DPubS?service=UI\&version=1.0\&verb=Display\&handle=euclid.aoap/1075828050},
	volume = {14},
	year = {2004}
}


@misc{citeulike:1525860,
	author = {Atar, R. and Zeitouni, O.},
	title = {Exponential stability for nonlinear filtering},
	year = {1996},
	url = {http://citeseer.ist.psu.edu/atar96exponential.html},
	citeulike-article-id = {1525860},
	keywords = {ergodicity, hmm},
	priority = {2},
	comment = {Another application of Birkhoff's coefficient to prove contraction bounds},
	abstract = {We study the a.s. exponential stability of the optimal filter w.r.t. its initial conditions. A
bound is provided on the exponential rate (equivalently, on the memory length of the filter)
for a general setting both in discrete and in continuous time, in terms of Birkhoff's contraction
coefficient. Criteria for exponential stability and explicit bounds on the rate are given in the
specific cases of a diffusion process on a compact manifold, and discrete time Markov chains on
both continuous and...}
}


@article{citeulike:1524082,
	author = {Kohlberg, Elon and Pratt, John W.},
	citeulike-article-id = {1524082},
	comment = {- Hilbert's metric is the only metric (up to bijection) under which every positive linear transformation is a contraction for points in R+.
- Definitions of projective metrics, properties of Hilbert's metric
- Birkhoff's contraction coefficient alternatively defined by the contraction of closest points},
	journal = {Mathematics of Operations Research},
	keywords = {ergodicity, hmm},
	number = {2},
	pages = {198--210},
	priority = {2},
	title = {The Contraction Mapping Approach to the {Perron-Frobenius} Theory: Why {Hilbert's} Metric?},
	url = {http://links.jstor.org/sici?sici=0364-765X\%28198205\%297\%3A2\%3C198\%3ATCMATT\%3E2.0.CO\%3B2-0},
	volume = {7},
	year = {1982}
}


@article{citeulike:1523434,
	author = {Hennion, H.},
	title = {Limit Theorems for Products of Positive Random Matrices},
	journal = {The Annals of Probability},
	volume = {25},
	number = {4},
	pages = {1545--1587},
	year = {1997},
	url = {http://links.jstor.org/sici?sici=0091-1798\%28199710\%2925\%3A4\%3C1545\%3ALTFPOP\%3E2.0.CO\%3B2-P},
	citeulike-article-id = {1523434},
	keywords = {hmm, linear-algebra},
	priority = {2},
	comment = {central-limit like theorems for products of random matrices in projective spaces}
}


@misc{citeulike:1523316,
	abstract = {We derive novel conditions that guarantee convergence of the Sum-Product
algorithm (also known as Loopy Belief Propagation or simply Belief Propagation)
to a unique fixed point, irrespective of the initial messages. The
computational complexity of the conditions is polynomial in the number of
variables. In contrast with previously existing conditions, our results are
directly applicable to arbitrary factor graphs (with discrete variables) and
are shown to be valid also in the case of factors containing zeros, under some
additional conditions. We compare our bounds with existing ones, numerically
and, if possible, analytically. For binary variables with pairwise
interactions, we derive sufficient conditions that take into account local
evidence (i.e., single variable factors) and the type of pair interactions
(attractive or repulsive). It is shown empirically that this bound outperforms
existing bounds.},
	archiveprefix = {arXiv},
	author = {Mooij, Joris M. and Kappen, Hilbert J.},
	citeulike-article-id = {1523316},
	comment = {Larger version of their UAI paper},
	eprint = {cs/0504030v2},
	keywords = {hmm},
	month = may,
	priority = {2},
	title = {Sufficient conditions for convergence of the Sum-Product Algorithm},
	url = {http://arxiv.org/abs/cs/0504030v2},
	year = {2007}
}


@article{citeulike:1484972,
	abstract = {A finite set A of N x N nilpotent commutative matrices that have one-dimensional joint kernel is considered. The theorem (due to Suprunenko and Tyshkevich) that the algebra generated by A and the identity matrix has dimension equal to N is proved. A canonical basis for this algebra is given, and related structure constants are discussed.},
	author = {Ko{\v{s}}ir, Toma{\v{z}}},
	citeulike-article-id = {1484972},
	comment = {discusses dimension of the algebra generated by commutative nilpotent NxN matrices + identity matrix (it's N)},
	doi = {10.1016/S0024-3795(96)00413-2},
	journal = {Linear Algebra and its Applications},
	keywords = {linear-algebra},
	month = aug,
	number = {1--3},
	pages = {293--305},
	priority = {2},
	title = {On the structure of commutative matrices. {II}},
	url = {http://dx.doi.org/10.1016/S0024-3795(96)00413-2},
	volume = {261},
	year = {1997}
}


% NOTE(review): the export carried only the given name "Frank" as author; the author list
% below is supplied from the known paper of this title -- confirm, and add the venue
% (this @article still has no journal).
@article{citeulike:1471813,
	abstract = {We describe a part-based object-recognition framework,
specialized to mining complex 3D objects from detailed 3D
images. Objects are modeled as a collection of parts together
with a pairwise potential function. The algorithm's
key component is an efficient inference algorithm, based on
belief propagation, that finds the optimal layout of parts,
given some input image. Belief Propagation (BP) -- a message
passing method for approximate inference in graphical
models -- is well suited to this task. However, for large
objects with many parts, even BP may be intractable. We
present AggBP, a message aggregation scheme for BP, in
which groups of messages are approximated as a single
message, producing a message update analogous to that of
mean-field methods. For objects consisting of N parts, we
reduce CPU time and memory requirements from $O(N^2)$ to
$O(N)$. We apply AggBP to both real-world and synthetic
tasks. First, we use our framework to recognize protein
fragments in three-dimensional images. Scaling BP to this
task for even average-sized proteins is infeasible without
our enhancements. We then use a synthetic ``object generator''
to test our algorithm's ability to locate a wide variety
of part-based objects. These experiments show that our improvements
result in minimal loss of accuracy, and in some
cases produce a more accurate solution than standard BP.},
	author = {DiMaio, Frank and Shavlik, Jude},
	citeulike-article-id = {1471813},
	comment = {Decrease complexity from N^2 to N using some kind of state aggregation},
	keywords = {graphical, hmm},
	priority = {2},
	title = {Improving the Efficiency of Belief Propagation in Large, Highly Connected Graphs},
	year = {2006}
}


@mastersthesis{citeulike:1460599,
	author = {Kao, Dennis},
	title = {Belief Propagation},
	citeulike-article-id = {1460599},
	keywords = {hmm, message-passing},
	priority = {2},
	comment = {intro to belief propagation, connections to thermodynamics},
	abstract = {introduction to bp, connections to thermodynamics}
}


@misc{mooijsufficient,
	author = {Mooij, Joris M. and Kappen, Hilbert J.},
	title = {Sufficient conditions for convergence of Loopy Belief Propagation},
	institution = {Dept. of Biophysics, Inst. for Neuroscience Radboud University Nijmegen},
	citeulike-article-id = {1460576},
	keywords = {hmm, message-passing},
	priority = {2},
	comment = {Similar results to "message errors" paper, quotient spaces}
}


% NOTE(review): the second author was exported as "Hunt, Harry B."; it is replaced with
% the co-author of the journal version of this paper (citeulike:1575880), and the bare
% "NIPS" booktitle is expanded -- confirm the volume number against the proceedings.
@inproceedings{ihler2005message,
	author = {Ihler, Alexander T. and Fisher, III, John W. and Willsky, Alan S.},
	booktitle = {Advances in Neural Information Processing Systems 17},
	citeulike-article-id = {1460573},
	comment = {Measures contraction of error in terms of the "mixing" rate for the potential (eq 12), a measure of uniformity of the transition matrix},
	keywords = {hmm},
	priority = {2},
	title = {Message Errors in Belief Propagation},
	year = {2005}
}


@article{citeulike:1458081,
	abstract = {We address the problem of filtering and fixed-lag smoothing for discrete-time and discrete-state hidden Markov models (HMMs), with the intention of extending some important results in Kalman filtering, notably the property of exponential stability. By appealing to a generalized Perron-Frobenius result for non-negative matrices, we are able to demonstrate exponential forgetting for both the recursive filters and smoothers; furthermore, methods for deriving overbounds on the convergence rate are indicated. Simulation studies for a two-state and two-output HMM verify qualitatively some of the theoretical predictions, and the observed convergence rate is shown to be bounded in accordance with the theoretical predictions.},
	author = {Shue, L. and Anderson, B. D. O. and Dey, S.},
	citeulike-article-id = {1458081},
	comment = {gives rates of convergence in terms of Birkhoff coefficient (bound on the contraction of the "projective distance"). Some tricks for getting tighter bounds on Birkhoff coefficient, but no quantitative connections to any real quantities.
Birkhoff coefficient is insensitive to cancellations of observation matrices, so bounds are not tight},
	journal = {IEEE Transactions on Signal Processing},
	keywords = {hmm},
	number = {8},
	pages = {2180--2194},
	priority = {2},
	title = {Exponential stability of filters and smoothers for hidden {Markov} models},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=705429},
	volume = {46},
	year = {1998}
}


@article{citeulike:1455795,
	abstract = {A representation for a weakly ergodic sequence of (nonstochastic) matrices allows products of nonnegative matrices which eventually become strictly positive to be expressed via products of some associated stochastic matrices and ratios of values of a certain function. This formula used in a random setup leads to a representation for the logarithm of a random matrix product. If the sequence of random matrices is in addition stationary then automatically almost all sequences are weakly ergodic, and the representation is expressed in terms of an one-dimensional stationary process. This permits properties of products of random matrices to be deduced from the latter. Second moment assumptions guarantee that central limit theorems and laws of the iterated logarithm hold for the random matrix products if and only if they hold for the corresponding stationary process. Finally, a central limit theorem for some classes of weakly dependent stationary random matrices is derived doing away with the restriction of boundedness of the ratios of column entries assumed by previous studies. Extensions beyond stationarity are discussed.},
	author = {Cohn, Harry and Nerman, Olle and Peligrad, Magda},
	citeulike-article-id = {1455795},
	comment = {Space-time harmonic trick to express non-negative matrices in terms of stochastic matrices},
	doi = {10.1007/BF01047581},
	journal = {Journal of Theoretical Probability},
	keywords = {hmm},
	month = apr,
	number = {2},
	pages = {389--405},
	priority = {2},
	title = {Weak ergodicity and products of random matrices},
	url = {http://dx.doi.org/10.1007/BF01047581},
	volume = {6},
	year = {1993}
}


% NOTE(review): journal, volume, number, month, year and first page are read off the SICI
% in the URL (ISSN 0013-0427, 1973-11, vol. 40, no. 160, p. 402); the end page is not in
% the SICI -- confirm it.
@article{citeulike:1451207,
	author = {Conlisk, John},
	citeulike-article-id = {1451207},
	comment = {gives definition of some matrix norms (like Holder norm), properties},
	journal = {Economica},
	keywords = {hmm, linear-algebra},
	month = nov,
	number = {160},
	pages = {402--409},
	priority = {2},
	title = {Quick Stability Checks and Matrix Norms},
	url = {http://links.jstor.org/sici?sici=0013-0427\%28197311\%292\%3A40\%3A160\%3C402\%3AQSCAMN\%3E2.0.CO\%3B2-2},
	volume = {40},
	year = {1973}
}


% NOTE(review): the journal is inferred from the URL handle (euclid.jap = Journal of
% Applied Probability); co-authors, volume, number, pages and year are supplied from the
% published record of this title -- confirm against the publisher page.
@article{citeulike:1449673,
	author = {Mitrophanov, A. Yu. and Lomsadze, A. and Borodovsky, M.},
	citeulike-article-id = {1449673},
	comment = {Gives a bound on sensitivity of forward probabilities as a function of perturbations in observation/transition matrix, to show that there's more sensitivity to observation parameters},
	journal = {Journal of Applied Probability},
	keywords = {hmm},
	number = {3},
	pages = {632--642},
	priority = {2},
	title = {Sensitivity of hidden {Markov} models},
	url = {http://projecteuclid.org/DPubS?service=UI\&version=1.0\&verb=Display\&handle=euclid.jap/1127322017},
	volume = {42},
	year = {2005}
}


@inproceedings{citeulike:1447735,
	abstract = {Inspired by the work of Daubechies and Lagarias on a set of matrices with convergent infinite products, we study the geometric approach to the classical problem of (weakly) ergodic non-homogeneous Markov chains. The existing key inequalities (related to the Hajnal inequality) in the literature are unified in this geometric picture. A more general inequality is established. Important quantities introduced by various authors are easily interpreted. A quantitative connection is established between ...},
	author = {Shen, J.},
	booktitle = {Proc. Wavelet Analysis and Multiresolution Methods},
	citeulike-article-id = {1447735},
	comment = {Represents stochastic matrices as simplices, proves contraction properties from geometric standpoint},
	editor = {He, T. X.},
	keywords = {hmm},
	priority = {2},
	title = {A geometric approach to ergodic nonhomogeneous {Markov} chains},
	url = {http://citeseer.ist.psu.edu/551682.html},
	year = {2000}
}


@misc{citeulike:899676,
	abstract = {We derive sufficient conditions for a family $(X^n,\rho_n,P_n)$ of metric
probability spaces to have the measure concentration property. Specifically, if
the sequence $\{P_n\}$ of probability measures satisfies a strong mixing
condition (which we call $\eta$-mixing) and the sequence of metrics
$\{\rho_n\}$ is what we call $\Psi$-dominated, we show that $(X^n,\rho_n,P_n)$
is a normal Levy family. We establish these properties for some metric
probability spaces, including the possibly novel $X=[0,1]$, $\rho_n=\ell_1$
case.},
	archiveprefix = {arXiv},
	author = {Kontorovich, Leonid},
	citeulike-article-id = {899676},
	comment = {gives Doeblin ergodicity coefficient (like Dobrushin, but for column-stochastic matrices).

Also shows that the Hidden Markov process is at least as concentrated as the underlying Markov process},
	eprint = {math.PR/0610427},
	keywords = {hmm},
	month = oct,
	priority = {2},
	title = {Metric and Mixing Sufficient Conditions for Concentration of Measure},
	url = {http://arxiv.org/abs/math.PR/0610427},
	year = {2006}
}


@article{citeulike:1447625,
	abstract = {We give a simple proof of a closed-form expression for the coefficient of ergodicity of a column-allowable nonnegative matrix.},
	author = {Artzrouni, Marc and Li, Xuefeng},
	citeulike-article-id = {1447625},
	comment = {Derivation of closed form of Birkhoff's coefficient. Also relation to Dobrushin (l1) coefficient -- Birkhoff coef of A is equivalent to Dobrushin if we first scale the columns to have the same length},
	journal = {Linear Algebra and its Applications},
	keywords = {hmm},
	month = jan,
	pages = {93--101},
	priority = {2},
	title = {A note on the coefficient of ergodicity of a column-allowable nonnegative matrix},
	url = {http://www.sciencedirect.com/science/article/B6V0R-3YCM2XC-M/2/5fc2301ae6463dd5dbf564dd92722794},
	volume = {214},
	year = {1995}
}


% NOTE(review): journal, volume, number and year are read off the URL path
% (ISSN 0304-3975 / 2003 / vol. 293 / issue 1); the "\&unknown;~ norm" residue in the
% abstract is restored as the sup norm -- confirm against the publisher abstract.
@article{citeulike:1445147,
	abstract = {Discrete event systems provide a useful abstraction for modelling a wide variety of systems: digital circuits, communication networks, manufacturing plants, etc. Their dynamics-stability, equilibrium states, cyclical behaviour, asymptotic average delays-are of vital importance to system designers. However, in marked contrast to continuous dynamical systems, there has been little systematic mathematical theory that designers can draw upon. In this paper, we survey the development of such a theory, based on the dynamics of maps which are nonexpansive in the $\ell_\infty$ norm. This has its origins in linear algebra over the max-plus semiring but extends to a nonlinear theory that encompasses a variety of problems arising in other mathematical disciplines. We concentrate on the mathematical aspects and set out several open problems.},
	author = {Gunawardena, J.},
	citeulike-article-id = {1445147},
	comment = {General overview of non-expansive maps for discrete systems, some connections of continuous dynamic theory},
	journal = {Theoretical Computer Science},
	keywords = {hmm},
	month = feb,
	number = {1},
	pages = {141--167},
	priority = {2},
	title = {From max-plus algebra to nonexpansive mappings: a nonlinear theory for discrete event systems},
	url = {http://www.ingentaconnect.com/content/els/03043975/2003/00000293/00000001/art00235},
	volume = {293},
	year = {2003}
}


@misc{citeulike:1444731,
	author = {Atar, R. and Zeitouni, O.},
	title = {Exponential stability for nonlinear filtering},
	year = {1996},
	url = {http://citeseer.ist.psu.edu/313187.html},
	citeulike-article-id = {1444731},
	keywords = {hmm},
	priority = {2},
	comment = {More connections to Lyapunov exponents, application of weak ergodicity},
	abstract = {We study the a.s. exponential stability of the optimal filter w.r.t. its initial conditions. A
bound is provided on the exponential rate (equivalently, on the memory length of the filter)
for a general setting both in discrete and in continuous time, in terms of Birkhoff's contraction
coefficient. Criteria for exponential stability and explicit bounds on the rate are given in the
specific cases of a diffusion process on a compact manifold, and discrete time Markov chains on
both continuous and...}
}


@misc{citeulike:1444721,
	abstract = {In this paper, we address the problem of filtering and fixed-lag smoothing for discrete-time
and discrete-state Hidden Markov Models (HMMs), with the intention of extending some important
results in Kalman filtering, notably the property of exponential stability. By appealing
to a generalised Perron-Frobenius result for nonnegative matrices, we are able to demonstrate
exponential forgetting for both the recursive filters and smoothers; furthermore, methods for
deriving overbounds on the...},
	author = {Shue, L. and Anderson, B. D. O. and Dey, S.},
	citeulike-article-id = {1444721},
	comment = {Gives upper bounds and averages for contraction rates of HMM's},
	keywords = {hmm},
	priority = {2},
	title = {Exponential stability of filters and smoothers for hidden {Markov} models},
	url = {http://citeseer.ist.psu.edu/235038.html},
	year = {1998}
}


@article{citeulike:1440881,
	abstract = {We determine a sufficient condition for the convergence to 0 of general products formed from a sequence of real or complex matrices. Our result is applied to obtain a condition for the weak ergodicity of an inhomogeneous Markov chain. We make some remarks comparing coefficients of ergodicity and we give a method for constructing these.},
	author = {Neumann, Michael and Schneider, Hans},
	citeulike-article-id = {1440881},
	comment = {Definition of coefficient of ergodicity},
	journal = {Linear Algebra and its Applications},
	keywords = {hmm},
	month = jan,
	number = {1--3},
	pages = {307--314},
	priority = {2},
	title = {The convergence of general products of matrices and the weak ergodicity of {Markov} chains},
	url = {http://www.sciencedirect.com/science/article/B6V0R-3VXSMS7-M/2/330043d5f5f473c4ca715577addc10e1},
	volume = {287},
	year = {1999}
}


% NOTE(review): the exported abstract had unbalanced braces ("max , ... xTF = 0} ... {}"),
% which ends the field early and breaks parsing of the file. The set expression below is
% reconstructed from the surrounding definition -- confirm against the publisher abstract.
@article{citeulike:1440877,
	abstract = {Given a square matrix $A$ and a norm $\|\cdot\|$, the coefficient of ergodicity of $A$ with respect to $\|\cdot\|$ is defined as $\max\{\|x^{T}A\| : \|x\| = 1,\ x^{T}F = 0\}$ with $F$ as a matrix satisfying $AF = 0$. We demonstrate that for a bounded set of such matrices with all coefficients of ergodicity of the matrices in the set below 1, all sequences constructed through inhomogenous products of matrices from the set converge geometrically.},
	author = {Hartfiel, D. J. and Rothblum, Uriel G.},
	citeulike-article-id = {1440877},
	journal = {Linear Algebra and its Applications},
	keywords = {hmm},
	month = jun,
	number = {1--3},
	pages = {1--9},
	priority = {2},
	title = {Convergence of inhomogenous products of matrices and coefficients of ergodicity},
	url = {http://www.sciencedirect.com/science/article/B6V0R-3TKS76P-1C/2/1767b65ee67fb011021226bb88c8990b},
	volume = {277},
	year = {1998}
}


@article{citeulike:1439973,
	author = {Anderson},
	title = {New developments in the theory of positive matrices},
	year = {1997},
	citeulike-article-id = {1439973},
	keywords = {hmm},
	priority = {2},
	comment = {some technical conditions for forgetting properties}
}


@article{citeulike:1438398,
	abstract = {The authors investigate common properties of 3 types of filters obtained by considering various stochastic models; Wiener filters, Kalman filters and hidden Markov model (HMM) filters. Unifying features which particularly stand out are the forgetting of old data and of initial conditions, and protection from round-off error effects' overpowering the calculations. They differentiate the concept of fixed-lag smoothing from filtering, and expose the comparative advantages and disadvantages. Once again, there are common properties which allow a unified viewpoint. We focus especially on characterizations of a maximally effective smoothing lag, and identification of the SNR circumstances under which smoothing is especially beneficial. The motivation is the processing of data from an array of acoustic sensors towed by a submarine.},
	author = {Anderson, B. D. O.},
	citeulike-article-id = {1438398},
	comment = {forgetting properties in three types of systems},
	journal = {IEEE Control Systems Magazine},
	keywords = {hmm},
	number = {3},
	pages = {41--51},
	priority = {2},
	title = {From {Wiener} to hidden {Markov} models},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=768539},
	volume = {19},
	year = {1999}
}


@misc{citeulike:1438337,
	abstract = {We consider an hidden Markov model with multidimensional observations, and with misspecification,
i.e. the assumed coefficients (transition probability matrix, and observation
conditional densities) are possibly different from the true coefficients. Under mild assumptions
on the coefficients of both the true and the assumed models, we prove that : (i) the
prediction filter, and its gradient w.r.t. some parameter in the model, forget almost surely
their initial condition exponentially fast, and...},
	author = {Le Gland, F. and Mevel, L.},
	citeulike-article-id = {1438337},
	comment = {Non-asymptotic forgetting rates},
	keywords = {hmm},
	priority = {2},
	title = {Exponential Forgetting and Geometric Ergodicity in Hidden {Markov} Models},
	url = {http://citeseer.ist.psu.edu/53964.html}
}


@article{citeulike:1438278,
	citeulike-article-id = {1438278},
	comment = {Formulates rate of convergence as the difference between the top two Lyapunov exponents},
	keywords = {hmm},
	priority = {2},
	title = {Some Uniform Estimates in Products of Random Matrices}
}


@misc{citeulike:1438184,
	abstract = {Let $A_k$, $k \in \mathbf{N}$, be a sequence of $n \times n$ matrices which converge to a matrix $A$. If $A$ and each $A_k$ is positive then the product $A_k A_{k-1} \cdots A_2 A_1 / \|A_k A_{k-1} \cdots A_2 A_1\|$ converges to a rank one matrix positive matrix $u w^{T}$, where $u$ is a positive column eigenvector of $A$. If each $A_k$ is nonsingular and $A$ has exactly one simple eigenvalue $\lambda$ of the maximal modulus with the corresponding eigenvector $u$, then $e^{-\sqrt{-1}\,\theta_k} A_k A_{k-1} \cdots A_2 A_1 / \|A_k A_{k-1} \cdots A_2 A_1\|$, $\theta_k \in \mathbf{R}$, converges to a rank one matrix $u w^{T}$.},
	author = {Friedland, Shmuel},
	citeulike-article-id = {1438184},
	comment = {overview of matrix product convergence},
	keywords = {hmm},
	priority = {2},
	title = {Convergence of Products of Matrices in Projective Spaces},
	url = {http://citeseer.ist.psu.edu/693184.html}
}


@article{citeulike:1438176,
	abstract = {This note concerns the projective contraction coefficient $\tau(H)$ of a rectangular matrix H with positive entries. A simple proof of an explicit formula for $\tau(H)$, originally established by [Trans. Am. Math. Soc. 85 (1957) 219], is given. The motivation for this work comes from the area of Markov decision processes, and the argument is based on elementary differential calculus.},
	author = {Cavazos-Cadena, Rolando},
	citeulike-article-id = {1438176},
	comment = {A simpler proof of contraction property of positive matrices},
	journal = {Linear Algebra and its Applications},
	keywords = {hmm},
	month = {December},
	pages = {291--297},
	priority = {2},
	title = {An alternative derivation of {Birkhoff's} formula for the contraction coefficient of a positive matrix},
	url = {http://www.sciencedirect.com/science/article/B6V0R-49JHRC9-1/2/644b6281aedc772c84866064e163ef48},
	volume = {375},
	year = {2003}
}


@article{citeulike:1438168,
	author               = {Carroll, Joseph  E. },
	title                = {Birkhoff's contraction coefficient},
	journal              = {Linear Algebra and its Applications},
	volume               = {389},
	pages                = {227--234},
	month                = {September},
	year                 = {2004},
	url                  = {http://www.sciencedirect.com/science/article/B6V0R-4CN9T9V-F/2/b62752957d530150df9fe2502936849c},
	keywords             = {hmm},
	comment              = {Proves contraction property of positive matrices (non-constructive)},
	priority             = {2},
	citeulike-article-id = {1438168},
	abstract             = {Presented here is a new proof of the theorem of Garrett Birkhoff which states that multiplication by any positive square matrix induces a contraction mapping on positive projective space with respect to the Hilbert projective metric and also evaluates the contraction coefficient.}
}


@article{citeulike:153943,
	abstract = {Advances in neurobiology permit neuroscientists to manipulate specific brain molecules, neurons and systems. This has lead to major advances in the neuroscience of reward. Here, it is argued that further advances will require equal sophistication in parsing reward into its specific psychological components: (1) learning (including explicit and implicit knowledge produced by associative conditioning and cognitive processes); (2) affect or emotion (implicit 'liking' and conscious pleasure) and (3) motivation (implicit incentive salience 'wanting' and cognitive incentive goals). The challenge is to identify how different brain circuits mediate different psychological components of reward, and how these components interact.},
	address = {Department of Psychology, Biopsychology Program, University of Michigan, Ann Arbor, MI 48109-1109, USA. berridge@umich.edu},
	author = {Berridge, K. C. and Robinson, T. E.},
	citeulike-article-id = {153943},
	comment = {3 kinds of reward, motivation, liking and wanting},
	issn = {0166-2236},
	journal = {Trends in Neurosciences},
	keywords = {neuroscience},
	month = {September},
	number = {9},
	pages = {507--513},
	priority = {2},
	title = {Parsing reward},
	url = {http://view.ncbi.nlm.nih.gov/pubmed/12948663},
	volume = {26},
	year = {2003}
}


@misc{citeulike:1432417,
	author               = {Le Gland, F.  and Mevel, L. },
	title                = {Basic properties of the projective product with application to products of column-allowable nonnegative matrices},
	year                 = {2000},
	url                  = {http://citeseer.ist.psu.edu/176894.html},
	keywords             = {hmm},
	comment              = {bounds on the non-singularity of inhomogenous products of matrices},
	priority             = {2},
	citeulike-article-id = {1432417},
	abstract             = {We use basic properties of the projective product, to obtain exponential bounds for the
Lipschitz constant associated with the projective product of column--allowable nonnegative
matrices. We obtain similar bounds for the associated linear tangent maps.

Keywords : projective product, Birkhoff contraction coefficient, cocycle, exponential
forgetting, product of matrices.


This work was partially supported by the Commission of the European Communities, under the SCIENCE
project System...}
}


@misc{citeulike:1432415,
	abstract = {We study the asymptotic behaviour of points under matrix cocycles generated by rectangular matrices.
In particular we prove a random Perron-Frobenius and a Multiplicative Ergodic Theorem. We
also provide an example where such products of random rectangular matrices arise in the theory
of random walks in random environments and where the Multiplicative Ergodic Theorem can be
used to investigate recurrence problems.
Key words: random dynamical system, products of random matrices, random walks in...},
	author = {Gundlach, V. and Steinkamp, O.},
	citeulike-article-id = {1432415},
	comment = {random Perron-Frobenius theorem},
	keywords = {hmm},
	priority = {2},
	title = {Products of random rectangular matrices},
	url = {http://citeseer.ist.psu.edu/gundlach95products.html},
	year = {1995}
}


@book{citeulike:1431984,
	abstract = {This book is a photographic reproduction of the book of the same title published in 1981, for which there has been continuing demand on account of its accessible technical level. Its appearance also helped generate considerable subsequent work on inhomogeneous products of matrices. This printing adds an additional bibliography on coefficients of ergodicity and a list of corrigenda. Eugene Seneta received his Ph.D. in 1968 from the Australian National University. He left Canberra in 1979 to become Professor and Head of the Department of Mathematical Statistics at the University of Sydney. He has been a regular visitor to the United States, most frequently to the University of Virginia. Now Emeritus Professor at the University of Sydney, he has recently developed a renewed interest in financial mathematics. He was elected Fellow of the Australian Academy of Science in 1985 and awarded the Pitman Medal of the Statistical Society of Australia for his distinguished research contributions. The first edition of this book, entitled Non-Negative Matrices, appeared in 1973, and was followed in 1976 by his Regularly Varying Functions in the Springer Lecture Notes in Mathematics, later translated into Russian. Both books were pioneering in their fields. In 1977, Eugene Seneta coauthored (with C. C. Heyde) I.J. Bienaym{\'e}: Statistical Theory Anticipated, which is effectively a history of probability and statistics in the 19th century, and in 2001 co-edited with the same colleague Statisticians of the Centuries, both published by Springer. Having served on the editorial board of the Encyclopedia of Statistical Science, he is currently Joint Editor of the International Statistical Review.},
	author = {Seneta, E.},
	citeulike-article-id = {1431984},
	comment = {Referenced by Y.Bengio and LeGland},
	howpublished = {Paperback},
	isbn = {0387297650},
	keywords = {hmm},
	month = {January},
	priority = {2},
	publisher = {Springer},
	series = {Springer Series in Statistics},
	title = {Non-negative Matrices and {Markov} Chains},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0387297650},
	year = {2006}
}


@misc{citeulike:1431672,
	abstract = {This paper studies the problem of ergodicity of transition probability
matrices in Markovian models, such as hidden Markov models (HMMs), and how it
makes very difficult the task of learning to represent long-term context for
sequential data. This phenomenon hurts the forward propagation of long-term
context information, as well as learning a hidden state representation to
represent long-term context, which depends on propagating credit information
backwards in time. Using results from Markov chain theory, we show that this
problem of diffusion of context and credit is reduced when the transition
probabilities approach 0 or 1, i.e., the transition probability matrices are
sparse and the model essentially deterministic. The results found in this paper
apply to learning approaches based on continuous optimization, such as gradient
descent and the Baum-Welch algorithm.},
	author = {Bengio, Y. and Frasconi, P.},
	citeulike-article-id = {1431672},
	comment = {Readable overview of Seneta applied to HMM's},
	eprint = {cs.AI/9510101},
	keywords = {hmm},
	month = {October},
	priority = {2},
	title = {Diffusion of Context and Credit Information in {Markovian} Models},
	url = {http://arxiv.org/abs/cs.AI/9510101},
	year = {1995}
}


@misc{citeulike:1423587,
	abstract = {Introduction

This chapter deals with likelihood inference for spatial point processes
using the methods of Moyeed and Baddeley (1991), Geyer
and Thompson (1992), Gelfand and Carlin (1993), Geyer (1994),
and Geyer and M{\o}ller (1994) using Markov chain Monte Carlo
(MCMC). The basic message is

If you can write down a model, I can do likelihood inference for it,
not only maximum likelihood estimation, but also likelihood ratio
tests, likelihood based confidence regions, profile likelihoods,...},
	author = {Geyer, Charles J.},
	citeulike-article-id = {1423587},
	comment = {Accessible explanations of mixing time derivations},
	priority = {2},
	title = {Likelihood Inference for Spatial Point Processes},
	url = {http://citeseer.ist.psu.edu/7219.html}
}


@article{citeulike:1417097,
	author = {Zhang},
	citeulike-article-id = {1417097},
	comment = {Gives examples when Naive Bayes classifier is optimal despite violated assumptions},
	keywords = {naivebayes},
	priority = {2},
	title = {Exploring Conditions for the Optimality of Naive {Bayes}},
	year = {2005}
}


@article{citeulike:1416782,
	address = {Hanover, MA, USA},
	author = {Montenegro, R. and Tetali, P.},
	citeulike-article-id = {1416782},
	comment = {Large (100+ p) overview of ways to bound mixing time of Markov Chain},
	doi = {10.1561/0400000003},
	journal = {Foundations and Trends in Theoretical Computer Science},
	keywords = {hmm},
	month = {May},
	number = {3},
	pages = {237--354},
	priority = {2},
	publisher = {Now Publishers Inc.},
	title = {Mathematical aspects of mixing times in {Markov} chains},
	url = {http://portal.acm.org/citation.cfm?id=1166414},
	volume = {1},
	year = {2006}
}


@article{citeulike:1414566,
	abstract = {The purpose of this paper is twofold: (a) to provide a tutorial introduction to some key concepts from the theory of computational complexity, highlighting their relevance to systems and control theory, and (b) to survey the relatively recent research activity lying at the interface between these fields. We begin with a brief introduction to models of computation, the concepts of undecidability, polynomial time algorithms, NP-completeness, and the implications of intractability results. We then ...},
	author = {Blondel, Vincent D. and Tsitsiklis, John N.},
	citeulike-article-id = {1414566},
	comment = {Top lyapunov exponent is unapproximable},
	journal = {Automatica},
	keywords = {computational-complexity, hmm},
	number = {9},
	pages = {1249--1274},
	priority = {2},
	title = {A survey of computational complexity results in systems and control},
	url = {http://citeseer.ist.psu.edu/270441.html},
	volume = {36},
	year = {2000}
}


@article{citeulike:1412989,
	abstract = {This paper considers the relative entropy between the conditional distribution and an incorrectly initialized filter for the estimation of one component of a Markov process given observations of the second component. Using the Markov property, we first establish a decomposition of the relative entropy between the measures on observation path space associated to different initial conditions. Using this decomposition, it is shown that the relative entropy of the optimal filter relative to an incorrectly initialized filter is a positive supermartingale. By applying the decomposition to signals observed in additive, white noise, a relative entropy bound is obtained on the integrated, expected, mean square difference between the optimal and incorrectly initialized estimates of the observation function.},
	author = {Clark},
	citeulike-article-id = {1412989},
	keywords = {hmm},
	priority = {2},
	title = {Relative Entropy and Error Bounds for Filtering of {Markov} Processes},
	year = {1999}
}


@article{citeulike:1412644,
	author = {Tsitsiklis},
	citeulike-article-id = {1412644},
	comment = {Lyapunov exponent can't be approximated unless P=NP. Note from Tsitsiklis -- this considers general case, and there are efficient algorithms for computing it for special kinds of matrices},
	keywords = {hmm},
	priority = {2},
	title = {The {Lyapunov} exponent and joint spectral radius of pairs of matrices are hard---when not impossible---to compute and to approximate},
	year = {1997}
}


@techreport{citeulike:1412621,
	abstract = {The Finite-State Markov Channel (FSMC) is a time-varying channel having states that are characterized by a finite-state Markov chain. These channels have infinite memory, which complicates their capacity analysis. We develop a new method to characterize the capacity of these channels based on Lyapunov exponents. Specifically, we show that the input, output, and conditional entropies for this channel are equivalent to the largest Lyapunov exponents for a particular class of random matrix products. We then show that the Lyapunov exponents can be expressed as expectations with respect to the stationary distributions of a class of continuous-state space Markov chains. The stationary distributions for this class of Markov chains are shown to be unique and continuous functions of the input symbol probabilities, provided that the input sequence has finite memory. These properties allow us to express mutual information and channel capacity in terms of Lyapunov exponents. We then leverage this connection between entropy and Lyapunov exponents to develop a rigorous theory for computing or approximating entropy and mutual information for finite-state channels with dependent inputs. We develop a method for directly computing entropy of finite-state channels that does not rely on simulation and establish its convergence. We also obtain a new asymptotically tight lower bound for entropy based on norms of random matrix products. In addition, we prove a new functional central limit theorem for sample entropy and apply this theorem to characterize the error in simulated estimates of entropy. Finally, we present numerical examples of mutual information computation for ISI channels and observe the capacity benefits of adding memory to the input sequence for such channels.},
	author = {Holliday, Tim and Glynn, Peter and Goldsmith, Andrea},
	citeulike-article-id = {1412621},
	comment = {Larger version of "Shannon meets Lyapunov." Shows that entropy of Markov chain can be expressed as a top Lyapunov exponent for product of random matrices},
	keywords = {hmm},
	priority = {2},
	title = {On Entropy and {Lyapunov} Exponents for Finite-State Channels}
}


@article{citeulike:1410138,
	author = {Algoet, Paul H. and Cover, Thomas M.},
	citeulike-article-id = {1410138},
	comment = {Definition of Shannon-McMillan-Breiman (existence of per symbol entropy limit)},
	journal = {The Annals of Probability},
	keywords = {information-theory},
	number = {2},
	pages = {899--909},
	priority = {2},
	title = {A Sandwich Proof of the {Shannon-McMillan-Breiman} Theorem},
	url = {http://links.jstor.org/sici?sici=0091-1798\%28198804\%2916\%3A2\%3C899\%3AASPOTS\%3E2.0.CO\%3B2-M},
	volume = {16},
	year = {1988}
}


@misc{citeulike:1410095,
	author               = {Nielsen, Jakob  L. },
	title                = {Lyapunov Exponent for Products of Random Matrices},
	url                  = {http://citeseer.ist.psu.edu/langgaardnielsen97lyapunov.html},
	keywords             = {hmm},
	priority             = {2},
	citeulike-article-id = {1410095},
	abstract             = {this report will be the second point, with emphasis put on
applications to statistical mechanics of disordered systems. In the next section,
we will define the Lyapunov exponents, and show its connection to
statistical mechanics. Then we discuss and compare the various methods of
obtaining numerical estimates of the Lyapunov exponents. These methods
are Monte-Carlo simulation, weak-disorder expansion [1], the microcanonical
method[2], and the cycle-expansion method[3]. Cycle expansions have...}
}


@inproceedings{citeulike:1407564,
	abstract = {This paper explores connections between Information Theory, Lyapunov exponents for products of random matrices, and hidden Markov models. Specifically, we will show that entropies associated with finite-state channels are equivalent to Lyapunov exponents. We use this result to show that the traditional prediction filter for hidden Markov models is not an irreducible Markov chain in our problem framework. Hence, we do not have access to many well-known properties of irreducible continuous state space Markov chains (e.g. a unique and continuous stationary distribution). However, by exploiting the connection between entropy and Lyapunov exponents and applying proof techniques from the theory of random matrix products we can solve a broad class of problems related to capacity and hidden Markov models. Our results provide strong regularity results for the non-irreducible prediction filter as well as some novel theoretical tools to address problems in these areas.},
	author = {Holliday, T. and Glynn, P. and Goldsmith, A.},
	booktitle = {44th IEEE Conference on Decision and Control, 2005 and 2005 European Control Conference (CDC-ECC '05)},
	citeulike-article-id = {1407564},
	comment = {Formulation of H(X) as a Lyapunov top exponent of a product of random matrices},
	keywords = {hmm},
	pages = {1756--1763},
	priority = {2},
	title = {{Shannon} Meets {Lyapunov}: Connections between Information Theory and Dynamical Systems},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1582414},
	year = {2005}
}


@article{citeulike:1398948,
	abstract = {We study the temporal connectivity structure of
single-channel Internet-based chat participation streams. Somewhat
similar to bibliometric analysis, and complementary to
topic-analysis, we base our study solely on context information
provided by the temporal order of participants' contributions.
Experimental results obtained by employing both network-analysis
indicators and an aggregate Markov modelling approach
indicate the existence of distinguishable communities in the about
one day worth real-world chat dynamics analysed.},
	author = {Kaban and Wang},
	citeulike-article-id = {1398948},
	comment = {Apply Baum-Welch to model P(personA's post follows personB's post) as an HMM with 6 hidden states},
	keywords = {chat},
	priority = {0},
	title = {Context Based Identification of User Communities from {Internet} Chat}
}


@article{citeulike:1397605,
	abstract = {This paper offers a detailed lesson plan on the forward-backward
algorithm. The lesson is taught from a live, commented
spreadsheet that implements the algorithm and graphs
its behavior on a whimsical toy example. By experimenting
with different inputs, one can help students develop intuitions
about HMMs in particular and Expectation Maximization in
general. The spreadsheet and a coordinated follow-up assignment
are available},
	author = {Eisner},
	citeulike-article-id = {1397605},
	comment = {gentle introduction to Baum-Welch},
	keywords = {hmm, ssl},
	priority = {2},
	title = {An Interactive Spreadsheet for Teaching the Forward-Backward Algorithm}
}


@article{citeulike:1392987,
	author               = {Schluter},
	title                = {Investigations on discriminative training criteria},
	comment              = {Gives bounds on Bayes error in terms of MCE/Gini criteria},
	priority             = {2},
	citeulike-article-id = {1392987}
}


@phdthesis{citeulike:1392618,
	author               = {Bergsma},
	title                = {Modeling conditional and marginal association in contingency table},
	priority             = {2},
	citeulike-article-id = {1392618}
}


@phdthesis{citeulike:1392616,
	author               = {Bergsma},
	title                = {Modeling conditional and marginal association in contingency table},
	keywords             = {loglinear},
	comment              = {asdfasdf},
	priority             = {2},
	citeulike-article-id = {1392616}
}


@article{citeulike:1392538,
	abstract = {In this letter, we investigate the impact of choosing different loss functions from the viewpoint of statistical learning theory. We introduce a convexity assumption, which is met by all loss functions commonly used in the literature, and study how the bound on the estimation error changes with the loss. We also derive a general result on the minimizer of the expected risk for a convex loss function in the case of classification. The main outcome of our analysis is that for classification, the hinge loss appears to be the loss of choice. Other things being equal, the hinge loss leads to a convergence rate practically indistinguishable from the logistic loss rate and much better than the square loss rate. Furthermore, if the hypothesis space is sufficiently rich, the bounds obtained for the hinge loss are not loosened by the thresholding stage.},
	address = {INFM-DISI, Universit{\`a} di Genova, 16146 Genoa, Italy. rosasco@disi.unige.it},
	author = {Rosasco, L. and De Vito, E. and Caponnetto, A. and Piana, M. and Verri, A.},
	citeulike-article-id = {1392538},
	comment = {hinge/logistic loss better than squared loss},
	doi = {10.1162/089976604773135104},
	issn = {0899-7667},
	journal = {Neural Computation},
	keywords = {statistical-learning-theory},
	month = {May},
	number = {5},
	pages = {1063--1076},
	priority = {2},
	title = {Are loss functions all the same?},
	url = {http://dx.doi.org/10.1162/089976604773135104},
	volume = {16},
	year = {2004}
}


@phdthesis{citeulike:1392426,
	author               = {Bergsma},
	title                = {Modeling conditional and marginal association in contingency table},
	keywords             = {loglinear},
	comment              = {intro to logistic models, formulations through design matrix},
	priority             = {2},
	citeulike-article-id = {1392426}
}


@article{citeulike:1390958,
	abstract = {This paper compares ordinary least squares (OLS) and logistic regression in terms of their underlying assumptions and results obtained on common data sets. Two data sets were analyzed with both methods. In the respective studies, the dependent variables were binary codes of 1) dropping out of school and 2) attending a private college. Results of both analyses were very similar. Significance tests (alpha = 0.05) produced identical decisions. OLS and logistic predicted values were highly correlated. Predicted classifications on the dependent variable were identical in study 1 and very similar in study 2. Logistic regression yielded more accurate predictions of dependent variable probabilities as measured by the average squared differences between the observed and predicted probabilities. It was concluded that both models can be used to test relationships with a binary criterion. However, logistic regression is superior to OLS at predicting the probability of an attribute, and should be the model of choice for that application.},
	author = {Pohlmann, John},
	citeulike-article-id = {1390958},
	priority = {2},
	title = {A comparison of ordinary least squares and logistic regression}
}


@article{citeulike:1390957,
	abstract = {This paper compares ordinary least squares (OLS) and logistic regression in terms of their underlying assumptions and results obtained on common data sets. Two data sets were analyzed with both methods. In the respective studies, the dependent variables were binary codes of 1) dropping out of school and 2) attending a private college. Results of both analyses were very similar. Significance tests (alpha = 0.05) produced identical decisions. OLS and logistic predicted values were highly correlated. Predicted classifications on the dependent variable were identical in study 1 and very similar in study 2. Logistic regression yielded more accurate predictions of dependent variable probabilities as measured by the average squared differences between the observed and predicted probabilities. It was concluded that both models can be used to test relationships with a binary criterion. However, logistic regression is superior to OLS at predicting the probability of an attribute, and should be the model of choice for that application.},
	author = {Pohlmann, John},
	citeulike-article-id = {1390957},
	keywords = {loglinear},
	priority = {2},
	title = {A comparison of ordinary least squares and logistic regression}
}


@inproceedings{citeulike:1389523,
	abstract = {We present a novel approach to modeling sequences using mixtures of conditional maximum entropy (maxent) distributions. Our method generalizes the mixture of first-order Markov models by including the "long-term" dependencies in model components. The "long-term" dependencies are represented by the frequently used in the natural language processing (NLP) domain probabilistic triggers or rules (such as "A occurred k positions back" $\rightarrow$ "the current symbol is B" with probability P). The maxent framework is then used to create a coherent global probabilistic model from all selected triggers. We enhance this formalism by using probabilistic mixtures with maxent models as components, thus representing hidden or unobserved effects in the data. We demonstrate how our mixture of conditional maxent models can be learned from data using the generalized EM algorithm that scales linearly in the dimensions of the data and the number of mixture components. We present empirical results on the simulated and real-world data sets and demonstrate that the proposed approach enables us to create better quality models than the mixtures of first-order Markov models and resist overfitting and curse of dimensionality that would inevitably present themselves for the higher order Markov models.},
	author = {Pavlov, D.},
	booktitle = {Third IEEE International Conference on Data Mining (ICDM 2003)},
	citeulike-article-id = {1389523},
	keywords = {maxent, ssl},
	pages = {251--258},
	priority = {2},
	title = {Sequence modeling with mixtures of conditional maximum entropy distributions},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1250927},
	year = {2003}
}


@article{citeulike:643155,
	abstract = {A review is given of shrinkage and penalization as tools to improve predictive accuracy of regression models. The James-Stein estimator is taken as starting point. Procedures covered are Pre-test Estimation, the Ridge Regression of Hoerl and Kennard, the Shrinkage Estimators of Copas and Van Houwelingen and Le Cessie, the LASSO of Tibshirani and the Garotte of Breiman. An attempt is made to place all these procedures in a unifying framework of semi-Bayesian methodology. Applications are briefly mentioned, but not amply discussed.},
	author = {van Houwelingen, J. C.},
	citeulike-article-id = {643155},
	comment = {Explains Hoerl's penalized regression, SURE},
	journal = {Statistica Neerlandica},
	keywords = {estimation},
	number = {1},
	priority = {2},
	title = {Shrinkage and Penalized Likelihood as Methods to Improve Predictive Accuracy},
	url = {http://www.ingentaconnect.com/content/bpl/stan/2001/00000055/00000001/art00154},
	volume = {55},
	year = {2001}
}


@article{citeulike:620762,
	author               = {Rubinstein and Hastie},
	title                = {Discriminative vs informative learning},
	keywords             = {generative-discriminative},
	comment              = {Gives an example where generative learning with incorrect model assumptions outperforms discriminative learning. Exponential tilts},
	priority             = {2},
	citeulike-article-id = {620762}
}

@article{citeulike:608500,
	abstract = {A simple probabilistic model is introduced to generalize classical linear discriminant analysis (LDA) in finding components that are informative of or relevant for data classes. The components maximize the predictability of the class distribution which is asymptotically equivalent to 1) maximizing mutual information with the classes, and 2) finding principal components in the so-called learning or Fisher metrics. The Fisher metric measures only distances that are relevant to the classes, that is, distances that cause changes in the class distribution. The components have applications in data exploration, visualization, and dimensionality reduction. In empirical experiments, the method outperformed, in addition to more classical methods, a Renyi entropy-based alternative while having essentially equivalent computational cost.},
	author = {Peltonen, J. and Kaski, S.},
	citeulike-article-id = {608500},
	journal = {IEEE Transactions on Neural Networks},
	keywords = {information-geometry},
	number = {1},
	pages = {68--83},
	priority = {2},
	title = {Discriminative components of data},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1388459},
	volume = {16},
	year = {2005}
}


@article{citeulike:228356,
	author = {Roos, Teemu and Wettig, Hannes and Gr{\"u}nwald, Peter and Myllym{\"a}ki, Petri and Tirri, Henry},
	citeulike-article-id = {228356},
	comment = {Equivalence between conditional Bayes nets and logistic regression for perfect canonical graphs},
	doi = {10.1007/s10994-005-0471-6},
	issn = {0885-6125},
	journal = {Machine Learning},
	keywords = {bayesnet, generative-discriminative},
	month = {June},
	number = {3},
	pages = {267--296},
	priority = {2},
	publisher = {Kluwer Academic Publishers},
	title = {On Discriminative {Bayesian} Network Classifiers and Logistic Regression},
	url = {http://dx.doi.org/10.1007/s10994-005-0471-6},
	volume = {59},
	year = {2005}
}


@book{citeulike:605354,
	author = {Nikiforov, A. F. and Uvarov, V. B.},
	citeulike-article-id = {605354},
	comment = {Properties of gamma function},
	howpublished = {Hardcover},
	isbn = {0817631836},
	month = {January},
	priority = {2},
	publisher = {Birkh{\"a}user},
	title = {Special Functions of Mathematical Physics: A Unified Introduction with Applications},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0817631836},
	year = {1988}
}


@article{citeulike:600955,
	abstract = {We describe a method for constructing a family of low rank, penalized scatterplot smoothers. These pseudosplines have shrinking behaviour that is similar to that of smoothing splines. They require two ingredients: a basis and a penalty sequence. The smoother is then computed by a generalized ridge regression. The family can be used to approximate existing high rank smoothers in terms of their dominant eigenvectors. Our motivating example uses linear combinations of orthogonal polynomials to approximate smoothing splines, where the linear combination and the penalty sequence depend on the particular instance of the smoother being approximated. As a leading application, we demonstrate the use of these pseudosplines in additive model computations. Additive models are typically fitted by an iterative smoothing algorithm, and any features other than the fit itself are difficult to compute. These include standard error curves, degrees of freedom, generalized cross-validation and influence diagnostics. By using a low rank pseudospline approximation for each of the smoothers involved, the entire additive fit can be approximated by a corresponding low rank approximation. This can be computed exactly and efficiently, and opens the door to a variety of computations that were not feasible before.},
	author = {Hastie, Trevor},
	citeulike-article-id = {600955},
	comment = {Defines effective degrees of freedom as the trace of the hat matrix},
	journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
	keywords = {estimation},
	number = {2},
	pages = {379--396},
	priority = {2},
	title = {Pseudosplines},
	url = {http://links.jstor.org/sici?sici=0035-9246\%281996\%2958\%3A2\%3C379\%3AP\%3E2.0.CO\%3B2-Z},
	volume = {58},
	year = {1996}
}


@inproceedings{citeulike:600951,
	abstract = {It has been observed in numerical simulations that a weight decay can improve generalization in a feed-forward neural network. This paper explains},
	author = {Krogh, Anders and Hertz, John A.},
	booktitle = {Advances in Neural Information Processing Systems 4 ({NIPS})},
	citeulike-article-id = {600951},
	comment = {Derives equation for optimal lambda for lowest squared loss in the presence of noise},
	keywords = {regularization},
	priority = {2},
	title = {A Simple Weight Decay Can Improve Generalization},
	year = {1992}
}


@article{citeulike:600941,
	abstract = {We derive the average and worst case number of nodes in decision diagrams of r-valued symmetric functions of n variables. We show that, for large n, both numbers approach $n^r/r!$. For binary decision diagrams (r=2), we compute the distribution of the number of functions on n variables with a specified number of nodes. Subclasses of symmetric functions appear as features in this distribution. For example, voting functions are noted as having an average of $n^2/6$ nodes, for large n, compared to $n^2/2$, for general binary symmetric functions},
	author = {Butler, J. T. and Herscovici, D. S. and Sasao, T. and Barton, R. J.},
	citeulike-article-id = {600941},
	comment = {Finds number of nodes needed to represent any voting function},
	journal = {IEEE Transactions on Computers},
	keywords = {discriminants},
	number = {4},
	pages = {491--494},
	priority = {2},
	title = {Average and worst case number of nodes in decision diagrams of symmetric multiple-valued functions},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=588065},
	volume = {46},
	year = {1997}
}


@article{citeulike:600940,
	address = {New York, NY, USA},
	author = {Bryant, Randal E.},
	citeulike-article-id = {600940},
	comment = {Introduces OBDD's, sizes on some problems, every symmetric function is O(n^2)},
	doi = {10.1145/136035.136043},
	issn = {0360-0300},
	journal = {ACM Computing Surveys},
	keywords = {discriminants},
	month = {September},
	number = {3},
	pages = {293--318},
	priority = {2},
	publisher = {ACM Press},
	title = {Symbolic {Boolean} manipulation with ordered binary-decision diagrams},
	url = {http://portal.acm.org/citation.cfm?id=136035.136043},
	volume = {24},
	year = {1992}
}


@article{citeulike:600939,
	author = {Magdon-Ismail, M.  and Nicholson, A.  and Abu-Mostafa, Y. S. },
	title = {Financial markets: very noisy information processing},
	journal = {Proceedings of the IEEE},
	volume = {86},
	number = {11},
	pages = {2184--2195},
	year = {1998},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=726786},
	citeulike-article-id = {600939},
	keywords = {estimation, regularization},
	priority = {2},
	comment = {Looks at effect of noise and training size on "stable" learning systems. Expected squared error grows linearly with variance, and 1/N with N=training size},
	abstract = {We report new results about the impact of noise on information processing with application to financial markets. These results quantify the trade-off between the amount of data and the noise level in the data. They also provide estimates for the performance of a learning system in terms of the noise level. We use these results to derive a method for detecting the change in market volatility from period to period. We successfully apply these results to the four major foreign exchange markets. The results hold for linear as well as nonlinear learning models and algorithms and for different noise models}
}


@article{citeulike:600905,
	abstract = {The problem of on-line learning in two-layer neural networks is studied within the framework of statistical mechanics. A fully connected committee machine with K hidden units is trained by gradient descent to perform a task defined by a teacher committee machine with M hidden units acting on randomly drawn inputs. The approach, based on a direct averaging over the activation of the hidden units, results in a set of first-order differential equations that describes the dynamical evolution of the overlaps among the various hidden units and allows for a computation of the generalization error. The equations of motion are obtained analytically for general K and M and provide a powerful tool used here to study a variety of realizable, over-realizable, and unrealizable learning scenarios and to analyze the role of the learning rate in controlling the evolution and convergence of the learning process.},
	author = {Saad, David and Solla, Sara A.},
	citeulike-article-id = {600905},
	comment = {Computes expectations of functions with respect to Gaussian integrals},
	journal = {Physical Review E},
	number = {4},
	pages = {4225--4243},
	priority = {2},
	title = {On-line learning in soft committee machines},
	volume = {52},
	year = {1995}
}


@article{citeulike:591371,
	author = {Cheeseman, Peter   and Stutz, John  },
	title = {Generalized Maximum Entropy},
	journal = {AIP Conference Proceedings},
	editor = {Knuth, Kevin  H.  and Abbas, Ali  E.  and Morris, Robin  D.  and Castle, Patrick  J. },
	volume = {803},
	number = {1},
	pages = {374--381},
	year = {2005},
	publisher = {AIP},
	doi = {10.1063/1.2149816},
	url = {http://scitation.aip.org/getabs/servlet/GetabsServlet?prog=normal\&id=APCPCS000803000001000374000001\&idtype=cvips\&gifs=yes},
	citeulike-article-id = {591371},
	keywords = {maxent},
	priority = {2},
	comment = {Treats parameters as random variables, calculates transformed density using change of variables},
	abstract = {A long standing mystery in using Maximum Entropy (MaxEnt) is how to deal with constraints whose values are uncertain. This situation arises when constraint values are estimated from data, because of finite sample sizes. One approach to this problem, advocated by E.T. Jaynes, is to ignore this uncertainty, and treat the empirically observed values as exact. We refer to this as the classic MaxEnt approach. Classic MaxEnt gives point probabilities (subject to the given constraints), rather than probability densities. We develop an alternative approach that assumes that the uncertain constraint values are represented by a probability density (e.g. a Gaussian), and this uncertainty yields a MaxEnt posterior probability density. That is, the classic MaxEnt point probabilities are regarded as a multidimensional function of the given constraint values, and uncertainty on these values is transmitted through the MaxEnt function to give uncertainty over the MaxEnt probabilities. We illustrate this approach by explicitly calculating the generalized MaxEnt density for a simple but common case, then show how this can be extended numerically to the general case. This paper expands the generalized MaxEnt concept introduced in a previous paper.}
}


@article{citeulike:590137,
	author = {Abbas, Ali  E. },
	title = {Maximum Entropy Distributions between Upper and Lower Bounds},
	journal = {AIP Conference Proceedings},
	editor = {Knuth, Kevin  H.  and Abbas, Ali  E.  and Morris, Robin  D.  and Castle, Patrick  J. },
	volume = {803},
	number = {1},
	pages = {25--42},
	year = {2005},
	publisher = {AIP},
	doi = {10.1063/1.2149777},
	url = {http://scitation.aip.org/getabs/servlet/GetabsServlet?prog=normal\&id=APCPCS000803000001000025000001\&idtype=cvips\&gifs=yes},
	citeulike-article-id = {590137},
	keywords = {maxent, optimization},
	priority = {2},
	comment = {Shows that can use any additive convex function on probabilities instead of entropy},
	abstract = {We discuss the formulation of discrete maximum entropy problems given upper and lower bounds on moments and probabilities. We show that with bounds on discrete probabilities, and bounds on cumulative probabilities, the solution is invariant to any additive concave objective function. This observation simplifies the analysis of the problem and unifies the solution of several generalized entropy expressions. We use this invariance result to provide an exact graphical solution to the maximum entropy distribution between upper and lower cumulative probability bounds. We also discuss the maximum entropy joint distribution with bounds on marginal probabilities and provide a graphical solution to the problem using properties of the entropy expression.}
}


@phdthesis{citeulike:587870,
	author = {Badsberg, J. H.},
	citeulike-article-id = {587870},
	comment = {Lauritzen's notation, chapter on closed form solutions to ML estimates in contingency tables},
	keywords = {contingency, graphical},
	priority = {2},
	school = {Department of Mathematics and Computer Science, Aalborg University},
	title = {An environment for graphical models},
	year = {1995}
}


@article{citeulike:587869,
	abstract = {General theoretical principles that enable the derivation of prior probabilities are of interest both in practical data analysis and, more broadly, in the foundations of probability theory. In this paper, it is shown that the general rule for the assignment of priors proposed by Jeffreys can be obtained from, and is logically equivalent to, an intuitively reasonable information-theoretical invariance principle. Some of the implications for the priors proposed by Hartigan, Jaynes, and Skilling, are also discussed.},
	author = {Goyal, Philip},
	citeulike-article-id = {587869},
	comment = {Derives multinomial Jeffrey's prior as the prior that makes amount of new information provided by an observation the same for all observations.},
	doi = {10.1063/1.2149815},
	editor = {Knuth, Kevin H. and Abbas, Ali E. and Morris, Robin D. and Castle, Patrick J.},
	journal = {AIP Conference Proceedings},
	keywords = {maxent, prior},
	number = {1},
	pages = {366--373},
	priority = {2},
	publisher = {AIP},
	title = {Prior Probabilities: An Information-Theoretic Approach},
	url = {http://scitation.aip.org/getabs/servlet/GetabsServlet?prog=normal\&id=APCPCS000803000001000366000001\&idtype=cvips\&gifs=yes},
	volume = {803},
	year = {2005}
}


@incollection{citeulike:587868,
	author = {Leonard, Tom and Hsu, John S. J.},
	booktitle = {Aspects of Uncertainty: A Tribute to D. V. Lindley},
	citeulike-article-id = {587868},
	editor = {Freeman, P. R. and Smith, A. F. M.},
	keywords = {bayesian, contingency},
	priority = {2},
	publisher = {Wiley},
	title = {The {Bayesian} analysis of categorical data---A selective review},
	year = {1994}
}


@article{citeulike:586727,
	abstract = {An attempt is made to determine the logically consistent rules for selecting a vector from any feasible set defined by linear constraints, when either all n-vectors or those with positive components or the probability vectors are permissible. Some basic postulates are satisfied if and only if the selection rule is to minimize a certain function which, if a "prior guess" is available, is a measure of distance from the prior guess. Two further natural postulates restrict the permissible distances to the author's f-divergences and Bregman's divergences, respectively. As corollaries, axiomatic characterizations of the methods of least squares and minimum discrimination information are arrived at. Alternatively, the latter are also characterized by a postulate of composition consistency. As a special case, a derivation of the method of maximum entropy from a small set of natural axioms is obtained.},
	author = {Csisz{\'a}r, Imre},
	citeulike-article-id = {586727},
	comment = {Defines unnormalized I-divergence, axioms unifying L2 and I-projections},
	journal = {The Annals of Statistics},
	keywords = {information-geometry, maxent},
	month = {December},
	number = {4},
	pages = {2032--2066},
	priority = {2},
	title = {Why Least Squares and Maximum Entropy? An Axiomatic Approach to Inference for Linear Inverse Problems},
	url = {http://links.jstor.org/sici?sici=0090-5364\%28199112\%2919\%3A4\%3C2032\%3AWLSAME\%3E2.0.CO\%3B2-J},
	volume = {19},
	year = {1991}
}


@unpublished{citeulike:587867,
	author = {Csisz{\'a}r, Imre and Shields, Paul C.},
	citeulike-article-id = {587867},
	comment = {Results on I-divergences. MLE as reverse I-divergence, f-divergences},
	keywords = {information-theory, maxent},
	note = {Unpublished lecture notes},
	priority = {2},
	title = {Notes on information theory and statistics},
	year = {1989}
}


@inproceedings{citeulike:587865,
	abstract = {We derive an equivalence between AdaBoost and the dual of a convex
optimization problem, showing that the only difference between minimizing
the exponential loss used by AdaBoost and maximum likelihood
for exponential models is that the latter requires the model to be normalized
to form a conditional probability distribution over labels. In addition
to establishing a simple and easily understood connection between
the two methods, this framework enables us to derive new regularization
procedures for boosting that directly correspond to penalized maximum
likelihood. Experiments on UCI datasets support our theoretical analysis
and give additional insight into the relationship between boosting and
logistic regression.},
	author = {Lebanon, Guy and Lafferty, John},
	booktitle = {Advances in Neural Information Processing Systems 14 ({NIPS})},
	citeulike-article-id = {587865},
	comment = {Adaboost.M2 loss comes out from dual of the Lagrangian, Logistic loss comes from primal},
	keywords = {boosting, maxent},
	priority = {2},
	title = {Boosting and Maximum Likelihood for Exponential Models},
	year = {2002}
}


@article{citeulike:543355,
	author = {Fraley, Chris   and Raftery, Adrian  E. },
	title = {How Many Clusters? Which Clustering Method? Answers Via Model-Based Cluster Analysis},
	journal = {The Computer Journal},
	volume = {41},
	number = {8},
	pages = {578--588},
	year = {1998},
	url = {http://citeseer.ist.psu.edu/fraley98how.html},
	citeulike-article-id = {543355},
	keywords = {clustering},
	priority = {2},
	abstract = {We consider the problem of determining the structure of clustered data, without prior knowledge of the number of clusters or any other information about their composition. Data are represented by a mixture model in which each component corresponds to a different cluster. Models with varying geometric properties are obtained through Gaussian components with different parameterizations and cross-cluster constraints. Noise and outliers can be modeled by adding a Poisson process component....}
}


@inproceedings{citeulike:586570,
	abstract = {Many language processing tasks can be reduced
to breaking the text into segments
with prescribed properties. Such tasks
include sentence splitting, tokenization,
named-entity extraction, and chunking.
We present a new model of text segmentation
based on ideas from multilabel classification.
Using this model, we can naturally
represent segmentation problems involving
overlapping and non-contiguous
segments. We evaluate the model on entity
extraction and noun-phrase chunking
and show that it is more accurate for overlapping
and non-contiguous segments, but
it still performs well on simpler data sets
for which sequential tagging has been the
best method.},
	author = {McDonald, Ryan and Crammer, Koby and Pereira, Fernando},
	booktitle = {Proceedings of Human Language Technology Conference and Conference on Empirical Methods in Natural Language Processing ({HLT/EMNLP})},
	citeulike-article-id = {586570},
	keywords = {nlp},
	priority = {2},
	title = {Flexible Text Segmentation with Structured Multilabel Classification},
	year = {2005}
}


@inproceedings{citeulike:586535,
	author = {Zhu, Shenghuo   and Ji, Xiang   and Xu, Wei   and Gong, Yihong  },
	title = {Multi-labelled classification using maximum entropy method},
	booktitle = {SIGIR '05: Proceedings of the 28th annual international ACM SIGIR conference on Research and development in information retrieval},
	pages = {274--281},
	year = {2005},
	publisher = {ACM Press},
	address = {New York, NY, USA},
	doi = {10.1145/1076034.1076082},
	isbn = {1595930345},
	url = {http://portal.acm.org/citation.cfm?id=1076082},
	citeulike-article-id = {586535},
	keywords = {maxent},
	priority = {2},
	comment = {Add features of the form f\_ij(x,y)=y\_i y\_j to predict multiple labels simultaneously}
}


@article{citeulike:584647,
	address = {Cambridge, MA, USA},
	author = {Bruce, Rebecca F. and Wiebe, Janyce M.},
	citeulike-article-id = {584647},
	comment = {Example of deriving factorization from graph},
	issn = {0891-2017},
	journal = {Computational Linguistics},
	keywords = {graphical},
	month = {June},
	number = {2},
	pages = {195--207},
	priority = {2},
	publisher = {MIT Press},
	title = {Decomposable modeling in natural language processing},
	url = {http://portal.acm.org/citation.cfm?id=973308},
	volume = {25},
	year = {1999}
}


@article{citeulike:580951,
	abstract = {This paper offers an introduction to Bayesian reference analysis, often described as the more successful method to produce non-subjective, model-based, posterior distributions. The ideas are illustrated in detail with an interesting problem, the ratio of multinomial parameters, for which no model-based Bayesian analysis has been proposed. Signposts are provided to the huge related literature.},
	author = {Bernardo, Jos{\'e} M. and Ram{\'o}n, Jos{\'e} M.},
	citeulike-article-id = {580951},
	comment = {Introduction on Bernardo's reference priors},
	journal = {Journal of the Royal Statistical Society. Series D (The Statistician)},
	keywords = {estimation, prior},
	number = {1},
	pages = {101--135},
	priority = {2},
	title = {An Introduction to {Bayesian} Reference Analysis: Inference on the Ratio of Multinomial Parameters},
	url = {http://links.jstor.org/sici?sici=0039-0526\%281998\%2947\%3A1\%3C101\%3AAITBRA\%3E2.0.CO\%3B2-C},
	volume = {47},
	year = {1998}
}


@article{citeulike:578825,
	author = {Mazumdar, M.  and Jefferson, T. R. },
	title = {Maximum Likelihood Estimates for Multinomial Probabilities via Geometric Programming},
	journal = {Biometrika},
	volume = {70},
	number = {1},
	pages = {257--261},
	year = {1983},
	url = {http://links.jstor.org/sici?sici=0006-3444\%28198304\%2970\%3A1\%3C257\%3AMLEFMP\%3E2.0.CO\%3B2-8},
	citeulike-article-id = {578825},
	keywords = {estimation},
	priority = {2},
	abstract = {This paper illustrates how the duality theory of geometric programming and the special methods available for solution can be used to obtain maximum likelihood estimates for multinomial probabilities. Three examples are given.}
}


@article{citeulike:577517,
	abstract = {Thirteen discriminant procedures are compared by applying them to five real sets of binary data and evaluating their leave-one-out error rates. Three versions of each data set have been used, containing respectively ``large'', ``moderate'' and ``small'' numbers of variables. To achieve the latter two categories, reduction of variables was first carried out using the all-subsets approach based on Kullback's information divergence measure. Sample size, number of non-empty multinomial cells and Empirical Integrated Rank are taken into account in assessment of classifier effectiveness. While the data sets are ones that arose during day-to-day statistical consulting, the empirical basis for drawing widespread conclusions is inevitably limited. Nevertheless, the study did highlight the following interesting features. The Kernel, Fourier and Hall's k-nearest neighbour classifiers had a tendency to overfit the data. The mixed integer programming classifier was clearly better than the other linear classifiers, and linear discriminant analysis had better results than logistic discrimination especially for small sample sizes. The second-order Bahadur procedure was generally very effective when the number of variables was large, but only if the sample size was large when the number of variables was small. The second-order log-linear models were very effective when the number of variables was small or when the sample sizes were large. Quadratic discrimination and Hills' k-nearest neighbour classification both performed poorly. The traditional statistical classifiers did not cope well with sparse binary data; the non-traditional classifiers such as neural networks or mixed integer programming classifiers were much better in such circumstances.},
	author = {Asparoukhov, Ognian K. and Krzanowski, Wojtek J.},
	citeulike-article-id = {577517},
	comment = {Compares 13 methods for classification over binary predictor variables. Linear regression works surprisingly well. Also logistic regression and naive bayes},
	doi = {10.1016/S0167-9473(01)00032-9},
	journal = {Computational Statistics \& Data Analysis},
	keywords = {estimation},
	month = {December},
	number = {2},
	pages = {139--160},
	priority = {2},
	title = {A comparison of discriminant procedures for binary variables},
	url = {http://dx.doi.org/10.1016/S0167-9473(01)00032-9},
	volume = {38},
	year = {2001}
}


@article{citeulike:577496,
	author = {Celeux, Gilles   and Mkhadri, Abdallah  },
	title = {Discrete regularized discriminant analysis},
	journal = {Statistics and Computing},
	year = {1992},
	citeulike-article-id = {577496},
	keywords = {estimation},
	priority = {2},
	comment = {Connections to Friedman's regularized discriminant analysis to contingency tables}
}


@techreport{citeulike:577493,
	abstract = {We propose a nonparametric penalized likelihood approach for variable
selection and model building, called likelihood basis pursuit (LBP). In the
setting of a tensor product reproducing kernel Hilbert space, we decompose
the log likelihood into the sum of different functional components such as
main effects and interactions, with each component represented by appropriate
basis functions. The basis functions are chosen to be compatible with
variable selection and model building in the context of a smoothing spline
ANOVA model. Basis pursuit is applied to obtain the optimal decomposition
in terms of having the smallest $l_1$ norm on the coefficients. We use the
functional $L_1$ norm to measure the importance of each component and determine
the ``threshold'' value by a sequential Monte Carlo bootstrap test algorithm.
As a generalized LASSO-type method, LBP produces shrinkage estimates for
the coefficients, which greatly facilitates the variable selection process, and
provides highly interpretable multivariate functional estimates at the same
time. To choose the regularization parameters appearing in the LBP models,
generalized approximate cross validation (GACV) is derived as a tuning
criterion. To make GACV widely applicable to large data set situations, its
randomized version is proposed as well. A technique ``slice modeling'' is used
to solve the optimization problem and makes the computation more efficient.},
	author = {Zhang, Hao H.},
	citeulike-article-id = {577493},
	comment = {Builds interaction models in an RKHS},
	institution = {Department of Statistics, University of Wisconsin--Madison},
	keywords = {regularization, svm},
	priority = {2},
	title = {Nonparametric variable selection and model building via likelihood basis pursuit},
	year = {2002}
}

@article{citeulike:577447,
	address = {Amsterdam, The Netherlands},
	author = {Bull, Shelley B. and Mak, Carmen and Greenwood, Celia M. T.},
	citeulike-article-id = {577447},
	comment = {Bias reduced estimator of canonical parameters},
	doi = {10.1016/S0167-9473(01)00048-2},
	issn = {0167-9473},
	journal = {Computational Statistics \& Data Analysis},
	keywords = {estimation},
	month = {March},
	number = {1},
	pages = {57--74},
	priority = {2},
	publisher = {Elsevier Science Publishers B. V.},
	title = {A modified score function estimator for multinomial logistic regression in small samples},
	url = {http://portal.acm.org/citation.cfm?id=584678},
	volume = {39},
	year = {2002}
}


@article{citeulike:575195,
	abstract = {We introduce a plane, which we call the delta-sigma plane, that is indexed by the norm of the estimator bias gradient and the variance of the estimator. The norm of the bias gradient is related to the maximum variation in the estimator bias function over a neighborhood of parameter space. Using a uniform Cramer-Rao (CR) bound on estimator variance, a delta-sigma tradeoff curve is specified that defines an ``unachievable region'' of the delta-sigma plane for a specified statistical model. In order to place an estimator on this plane for comparison with the delta-sigma tradeoff curve, the estimator variance, bias gradient, and bias gradient norm must be evaluated. We present a simple and accurate method for experimentally determining the bias gradient norm based on applying a bootstrap estimator to a sample mean constructed from the gradient of the log-likelihood. We demonstrate the methods developed in this paper for linear Gaussian and nonlinear Poisson inverse problems},
	author = {Hero, A. O. and Fessler, J. A. and Usman, M.},
	citeulike-article-id = {575195},
	journal = {IEEE Transactions on Signal Processing},
	keywords = {estimation},
	number = {8},
	pages = {2026--2041},
	priority = {2},
	title = {Exploring estimator bias-variance tradeoffs using the uniform {CR} bound},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=533723},
	volume = {44},
	year = {1996}
}

@misc{citeulike:573030,
	abstract = {The three main theoretical bases of the concepts of entropy and cross-entropy
- information-theoretic, axiomatic and combinatorial - are critically examined.
It is shown that the combinatorial basis, proposed by Boltzmann and Planck, is
the most fundamental (most primitive) basis of these concepts, since it
provides (i) a derivation of the Kullback-Leibler cross-entropy and Shannon
entropy functions, as simplified forms of the multinomial distribution subject
to the Stirling approximation; (ii) an explanation for the need to maximize
entropy (or minimize cross-entropy) to find the most probable realization; and
(iii) the means to derive entropy and cross-entropy functions for systems which
do not satisfy the multinomial distribution, i.e. which fall outside the domain
of the Kullback-Leibler and Shannon measures. The information-theoretic and
axiomatic bases of cross-entropy and entropy - whilst of tremendous importance
and utility - are therefore seen as secondary viewpoints, which lack the
breadth of the combinatorial approach. Appreciation of this reasoning would
permit development of a powerful body of ``combinatorial information theory'', as
a tool for statistical inference in all fields (inside and outside science).
The essential features of Jaynes' analysis of entropy and cross-entropy -
reinterpreted in light of the combinatorial approach - are outlined, including
derivation of probability distributions, ensemble theory, Jaynes relations,
fluctuation theory and Jaynes' entropy concentration theorem. New results
include a generalized free energy (or ``free information'') concept, a
generalized Gibbs-Duhem relation and phase rule. Generalized (combinatorial)
definitions of entropy and cross-entropy, valid for any combinatorial system,
are then proposed and examined in detail.},
	archiveprefix = {arXiv},
	author = {Niven, Robert K.},
	citeulike-article-id = {573030},
	comment = {Derivation of combinatorial justification for maxent},
	eprint = {cond-mat/0512017},
	keywords = {maxent},
	month = {Jan},
	priority = {2},
	title = {Combinatorial Information Theory: I. Philosophical Basis of Cross-Entropy and Entropy},
	url = {http://arxiv.org/abs/cond-mat/0512017},
	year = {2006}
}


@techreport{citeulike:572655,
	abstract = {We present a method for conditional maximum likelihood estimation of Naive Bayes models that employs a well known technique relying on a generalization of the Baum-Eagon inequality from polynomials to rational functions. The main advantage of the procedure is that it keeps the model parameter values (probabilities) properly normalized at each iteration. We apply the model trained under the maximum likelihood and conditional maximum likelihood criteria, respectively, to a text classification problem. A simple modification of the algorithm increases the convergence speed significantly over a straightforward implementation. The model trained under the conditional maximum likelihood criterion achieves a relative improvement of 40\% in classification accuracy over its maximum likelihood counterpart on a text classification task.},
	author = {Chelba, Ciprian and Acero, Alex},
	citeulike-article-id = {572655},
	comment = {Some trick from HMM community to do discriminative training of NB in original parametrization. Non-convex training problem, worse performance than MaxEnt},
	institution = {Microsoft Research},
	keywords = {generative-discriminative},
	priority = {2},
	title = {Conditional Maximum Likelihood Estimation of Naive {Bayes} Probability Models Using Rational Function Growth Transform},
	year = {2004}
}

@mastersthesis{citeulike:572512,
	abstract = {The goal of discriminative sequence learning is to learn how to classify items that can be
arranged in a sequence. Many models have been proposed including logistic regression,
the maximum entropy Markov model, the conditional random field, the input output
Markov model, the hidden random field, and template models based on restricted Boltzmann
machines. These models differ along several dimensions: whether they can be
represented by a directed graphical model or an undirected one, whether or not they are
chain structured, whether or not they are fully observed models, and whether or not they
can incorporate knowledge about larger scale label structures. In this work, we compare
these models on several synthetic problems and on a larger information extraction task.},
	author = {Stewart, Liam},
	citeulike-article-id = {572512},
	comment = {Restricted Boltzman Machines (RBM) for template features over output labels},
	keywords = {crf},
	priority = {2},
	school = {University of Toronto},
	title = {Structure Learning in Sequential Data},
	year = {2005}
}


@inproceedings{citeulike:572500,
	author = {Phan, Xuan-Hieu   and Nguyen, Le-Minh   and Ho, Tu-Bao   and Horiguchi, Susumu  },
	title = {Improving discriminative sequential learning with rare--but--important associations},
	booktitle = {KDD '05: Proceeding of the eleventh ACM SIGKDD international conference on Knowledge discovery in data mining},
	pages = {304--313},
	year = {2005},
	publisher = {ACM Press},
	address = {New York, NY, USA},
	doi = {10.1145/1081870.1081906},
	isbn = {159593135X},
	url = {http://portal.acm.org/citation.cfm?id=1081870.1081906},
	citeulike-article-id = {572500},
	keywords = {crf},
	priority = {2},
	comment = {Uses datamining to find features to use for CRF}
}


@inproceedings{citeulike:572494,
	abstract = {Discriminative learning techniques for sequential data have
proven to be more effective than generative models for named entity
recognition, information extraction, and other tasks of discrimination.
However, semi-supervised learning mechanisms that utilize inexpensive
unlabeled sequences in addition to few labeled sequences -- such as the
Baum-Welch algorithm -- are available only for generative models. The
multi-view approach is based on the principle of maximizing the consensus
among multiple independent hypotheses; we develop this principle
into a semi-supervised hidden Markov perceptron, and a semi-supervised
hidden Markov support vector learning algorithm. Experiments reveal
that the resulting procedures utilize unlabeled data effectively and discriminate
more accurately than their purely supervised counterparts.},
	author = {Brefeld, Ulf and B{\"u}scher, Christoph and Scheffer, Tobias},
	booktitle = {Proceedings of the European Conference on Machine Learning},
	citeulike-article-id = {572494},
	comment = {Trains hidden Markov SVM's additionally minimizing disagreement on unlabelled datasets using 2 view learning (motivated by Dasgupta's co-training bounds)},
	keywords = {semisupervised, svm},
	priority = {2},
	title = {Multi-View Discriminative Sequential Learning},
	year = {2005}
}


@techreport{jing05tr,
	author = {Jing, Y. and Pavlovic, V. and Rehg, J. M.},
	citeulike-article-id = {572491},
	comment = {Derives boosting to maximize CLL score (upper bound on misclassifications), using Naive Bayes as base classifiers. Extends approach to sequences, gets similar results to CRF, but faster. Good bibliography of discriminative BN training},
	institution = {College of Computing, Georgia Institute of Technology},
	keywords = {boosting, crf},
	number = {GIT-GVU-05-23},
	priority = {2},
	title = {Boosted {Bayesian} Network Classifiers},
	url = {http://www.cs.rutgers.edu/~vladimir/pub/jing05tr.pdf},
	year = {2005}
}


@misc{citeulike:348725,
	abstract = {This paper investigates a boosting approach to discriminative learning of label sequences based on a sequence rank loss function.},
	author = {Altun, Y. and Hofmann, T. and Johnson, M.},
	citeulike-article-id = {348725},
	comment = {boosting derivation for log loss for sequences, sparser models than gradient descent, but worse performance},
	keywords = {boosting, crf},
	priority = {2},
	title = {Discriminative learning for label sequences via boosting},
	url = {http://citeseer.ist.psu.edu/altun02discriminative.html},
	year = {2003}
}


@misc{citeulike:572465,
	abstract = {Discriminative models have been of interest in the NLP community in recent years.},
	author = {Altun, Yasemin and Johnson, Mark and Hofmann, Thomas},
	citeulike-article-id = {572465},
	comment = {Tries sequence-wise/point-wise log-loss/exponential loss, finds not much difference in performance. Also compares gradient descent method to perceptron and boosting},
	keywords = {crf, generative-discriminative},
	priority = {2},
	title = {Investigating Loss Functions and Optimization Methods for Discriminative Learning of Label Sequences},
	url = {http://citeseer.ist.psu.edu/altun03investigating.html},
	year = {2003}
}


@article{citeulike:572046,
	abstract = {We consider the idea of combining the key advantages of Bayesian networks and of kernel-based learning systems. In connection with two-label classification tasks over the Boolean domain, we study the question whether the class of decision functions induced by a given Bayesian network can be represented within a low-dimensional inner product space. For Bayesian networks with an explicitly given (full or reduced) parameter collection, we establish tight bounds on the dimension of the ``natural'' inner product space. Further, we consider a variant of the logistic autoregressive Bayesian network and show that every sufficiently expressive inner product space must have dimension at least $2^{\Omega(n)}$, where $n$ is the number of network nodes. As the main technical contribution, this work reveals combinatorial and algebraic structures within Bayesian networks such that known techniques for proving lower bounds on the dimension of inner product spaces can be brought into play.},
	author = {Nakamura and Schmitt and Schmitt and Simon},
	citeulike-article-id = {572046},
	journal = {Journal of Machine Learning Research},
	keywords = {graphical, kernel},
	priority = {2},
	title = {Inner product spaces for {Bayesian} networks},
	year = {2006}
}


@inproceedings{Garcia04,
	author = {Garcia, Luis D.},
	title = {Algebraic Statistics in Model Selection},
	booktitle = {Proceedings of the 20th Annual Conference on Uncertainty in Artificial Intelligence (UAI-04)},
	pages = {177--184},
	publisher = {AUAI Press},
	address = {Arlington, Virginia},
	year = {2004},
	keywords = {algebra, graphical},
	comment = {Some algebra facts on "effective dimension" of simple Bayes Nets with hidden variables},
	priority = {2},
	citeulike-article-id = {572005}
}


@phdthesis{citeulike:572003,
	author = {Ali, Rebecca},
	citeulike-article-id = {572003},
	comment = {Introduces some Markov equivalence results, also results for chain graphs, ancestral graphs},
	keywords = {graphical},
	priority = {2},
	school = {University of Washington},
	title = {Applying Graphical Models to Partially Observed Data Generating Processes},
	year = {2002}
}


@inproceedings{citeulike:572000,
	abstract = {Learning the parameters of an undirected graphical model is particularly difficult due to the presence of a global normalization constant. For large unstructured models computing the gradient of the log-likelihood is intractable and approximations become necessary. Several approximate learning algorithms have been proposed in the literature but a thorough comparative study seems to be absent. In this paper we report on the results of a series of experiments which compare a number of learning algorithms on several models. In our experimental design we use perfect sampling techniques in order to be able to assess quantities such as (asymptotic) normality, bias and variance of the estimates. We envision this effort as a first step towards a more comprehensive open source testing environment where researchers can submit learning algorithms and benchmark problems.},
	author = {Parise and Welling},
	citeulike-article-id = {572000},
	comment = {Compares Besag's pseudo-likelihood, pseudo-moment matching, Hinton's contrastive divergence for Ising-like models. CD works best for dense graphs, PMM works best for sparse},
	keywords = {estimation, graphical},
	priority = {2},
	title = {Learning in {Markov} Random Fields: An Empirical Study},
	year = {2005}
}


@techreport{citeulike:571222,
	author = {Meek},
	title = {Relating Graphical Frameworks: Undirected, Directed Acyclic and Chain Graph Models},
	year = {1995},
	keywords = {graphical},
	comment = {Gives condition for equivalence of directed and undirected (no unshielded colliders, positive)},
	priority = {2},
	citeulike-article-id = {571222}
}


@techreport{citeulike:571219,
	author = {Flesch and Lucas},
	citeulike-article-id = {571219},
	comment = {Need to know skeleton and set of unshielded colliders, suggests searching in the space of equivalence classes rather than the space of graphs},
	priority = {2},
	title = {{Markov} Equivalence in {Bayesian} Networks},
	year = {2004}
}


@article{citeulike:568511,
	author = {Goodman, Leo A.},
	title = {Exploratory Latent Structure Analysis Using Both Identifiable and Unidentifiable Models},
	journal = {Biometrika},
	volume = {61},
	number = {2},
	pages = {215--231},
	year = {1974},
	url = {http://links.jstor.org/sici?sici=0006-3444\%28197408\%2961\%3A2\%3C215\%3AELSAUB\%3E2.0.CO\%3B2-F},
	abstract = {This paper considers a wide class of latent structure models. These models can serve as possible explanations of the observed relationships among a set of m manifest polytomous variables. The class of models considered here includes both models in which the parameters are identifiable and also models in which the parameters are not. For each of the models considered here, a relatively simple method is presented for calculating the maximum likelihood estimate of the frequencies in the m-way contingency table expected under the model, and for determining whether the parameters in the estimated model are identifiable. In addition, methods are presented for testing whether the model fits the observed data, and for replacing unidentifiable models that fit by identifiable models that fit. Some illustrative applications to data are also included.},
	keywords = {estimation, graphical},
	comment = {"For instance, statisticians have known for decades that the dimension of a naive Bayes model can unexpectedly drop (e.g., Goodman 1974)"},
	priority = {2},
	citeulike-article-id = {568511}
}


@article{citeulike:568504,
	abstract = {This paper introduces a class of graphical independence models that is closed under marginalization and conditioning but that contains all DAG independence models. This class of graphs, called maximal ancestral graphs, has two attractive features: there is at most one edge between each pair of vertices; every missing edge corresponds to an independence relation. These features lead to a simple parameterization of the corresponding set of distributions in the Gaussian case.},
	author = {Richardson, Thomas and Spirtes, Peter},
	citeulike-article-id = {568504},
	comment = {generalizes directed and undirected, closed under marginalization},
	journal = {The Annals of Statistics},
	keywords = {graphical},
	number = {4},
	pages = {962--1030},
	priority = {2},
	title = {Ancestral Graph {Markov} Models},
	url = {http://links.jstor.org/sici?sici=0090-5364\%28200208\%2930\%3A4\%3C962\%3AAGMM\%3E2.0.CO\%3B2-J},
	volume = {30},
	year = {2002}
}


@unpublished{citeulike:568501,
	abstract = {There has been significant interest in developing new forms of
acoustic model, in particular models which allow additional dependencies
to be represented than allowed within a standard hidden
Markov model (HMM). This paper discusses one such class
of models, augmented statistical models. Here a locally exponential
approximation is made about some point on a base distribution.
This allows additional dependencies within the data to be modelled
than are represented in the base distribution. Augmented models
based on Gaussian mixture models (GMMs) and HMMs are briefly
described. These augmented models are then related to generative
kernels, one approach used for allowing support vector machines
(SVMs) to be applied to variable length data. The training
of augmented statistical models within an SVM, generative kernel,
framework is then discussed. This may be viewed as using maximum
margin training to estimate statistical models. Augmented
Gaussian mixture models are then evaluated using rescoring on a
large vocabulary speech recognition task.},
	author = {Gales and Layton},
	citeulike-article-id = {568501},
	comment = {For every point of the family, finds a tangent exponential family and throws it into the mix, resulting in "augmented statistical models", also slides from NIPS 2005 workshop},
	keywords = {exponential-families, generative-discriminative, svm},
	priority = {2},
	title = {{SVM}'s, score-spaces and maximum margin statistical models},
	year = {2005}
}


@article{citeulike:566135,
	author = {Albert, A. and Anderson, J. A.},
	title = {On the Existence of Maximum Likelihood Estimates in Logistic Regression Models},
	journal = {Biometrika},
	volume = {71},
	number = {1},
	pages = {1--10},
	year = {1984},
	url = {http://links.jstor.org/sici?sici=0006-3444\%28198404\%2971\%3A1\%3C1\%3AOTEOML\%3E2.0.CO\%3B2-N},
	abstract = {The problems of existence, uniqueness and location of maximum likelihood estimates in log linear models have received special attention in the literature (Haberman, 1974, Chapter 2; Wedderburn, 1976; Silvapulle, 1981). For multinomial logistic regression models, we prove existence theorems by considering the possible patterns of data points, which fall into three mutually exclusive and exhaustive categories: complete separation, quasicomplete separation and overlap. Our results suggest general rules for identifying infinite parameter estimates in log linear models for frequency tables.},
	keywords = {estimation},
	priority = {2},
	citeulike-article-id = {566135}
}


@article{citeulike:541115,
	abstract = {We provide a detailed, introductory exposition of the
Metropolis-Hastings algorithm, a powerful Markov chain
method to simulate multivariate distributions. A simple,
intuitive derivation of this method is given along
with guidance on implementation. Also discussed are
two applications of the algorithm, one for implementing
acceptance-rejection sampling when a blanketing function
is not available and the other for implementing the algorithm
with block-at-a-time scans. In the latter situation
many different algorithms, including the Gibbs sampler
are shown to be special cases of the Metropolis-Hastings
algorithm. The methods are illustrated with examples.},
	author = {Chib, Siddhartha and Greenberg, Edward},
	citeulike-article-id = {541115},
	journal = {The American Statistician},
	number = {4},
	pages = {327--335},
	priority = {2},
	title = {Understanding the {Metropolis-Hastings} Algorithm},
	volume = {49},
	year = {1995}
}


@inproceedings{citeulike:541067,
	address = {New York, NY, USA},
	author = {Lebanon, Guy and Lafferty, John},
	booktitle = {{ICML} '04: Proceedings of the twenty-first international conference on Machine learning},
	citeulike-article-id = {541067},
	comment = {Multinomial vs Euclidean logistic regression},
	doi = {10.1145/1015330.1015333},
	isbn = {1581138285},
	keywords = {information-geometry},
	priority = {2},
	publisher = {ACM Press},
	title = {Hyperplane margin classifiers on the multinomial manifold},
	url = {http://portal.acm.org/citation.cfm?id=1015333},
	year = {2004}
}


@article{citeulike:540879,
	author = {Klauer},
	title = {Non-exponential families of distributions},
	keywords = {exponential-families},
	comment = {Gives a criterion to tell that a family of distributions is not exponential},
	priority = {2},
	citeulike-article-id = {540879}
}


@article{citeulike:540878,
	abstract = {Let $\phi$ be a parameter for which there is no unbiased estimator. This note shows that for an arbitrary sequence of estimators $T^{(k)}$, if the biases of $T^{(k)}$ tend to 0 then their variances must tend to $\infty$.},
	author = {Doss and Sethuraman},
	citeulike-article-id = {540878},
	journal = {The Annals of Statistics},
	keywords = {estimation},
	number = {1},
	pages = {440--442},
	priority = {2},
	title = {The Price of Bias Reduction when there is no Unbiased Estimate},
	url = {http://links.jstor.org/sici?sici=0090-5364\%28198903\%2917\%3A1\%3C440\%3ATPOBRW\%3E2.0.CO\%3B2-4\&size=LARGE},
	volume = {17},
	year = {1989}
}


@article{citeulike:539282,
	author = {McCullagh, Peter},
	citeulike-article-id = {539282},
	journal = {Biometrika},
	keywords = {estimation, statistics},
	number = {2},
	pages = {247--259},
	priority = {2},
	title = {Conditional Inference and {Cauchy} Models},
	url = {http://links.jstor.org/sici?sici=0006-3444\%28199206\%2979\%3A2\%3C247\%3ACIACM\%3E2.0.CO\%3B2-4},
	volume = {79},
	year = {1992}
}


@unpublished{citeulike:539144,
	abstract = {It has been realised for several decades now, probably since Efron's paper introducing the
concept of statistical curvature [Efr75], that most of the main concepts and methods of differential
geometry are of substantial interest in connection with the theory of statistical inference.
This report describes in simple cases the links existing between the two theories. It is based
on an article introducing the topic, by R. Kass [Kas89]. The focus is on parametric statistical
models.},
	author = {Nye, Jerome},
	citeulike-article-id = {539144},
	comment = {Term report on differential geometry in statistics},
	keywords = {exponential-families, information-geometry},
	priority = {2},
	title = {Statistics and Differential Geometry}
}


@article{citeulike:535442,
	author = {Henrion and Pradhan and Huang and Provan and O'Rorke},
	title = {Why is diagnosis using belief networks insensitive to imprecision in probabilities?},
	abstract = {Recent research has found that diagnostic performance with Bayesian belief networks is often surprisingly insensitive to imprecision in the numerical probabilities. For example, the authors have recently completed an extensive study in which they applied random noise to the numerical probabilities in a set of belief networks for medical diagnosis, subsets of the CPCS network, a subset of the QMR (Quick Medical Reference) focused on liver and bile diseases. The diagnostic performance in terms of the average probabilities assigned to the actual diseases showed small sensitivity even to large amounts of noise. In this paper, we summarize the findings of this study and discuss possible explanations of this low sensitivity. One reason is that the criterion for performance is average probability of the true hypotheses, rather than average error in probability, which is insensitive to symmetric noise distributions. But, we show that even asymmetric, logodds-normal noise has modest effects. A second reason is that the gold-standard posterior probabilities are often near zero or one, and are little disturbed by noise.},
	keywords = {bayesnet},
	priority = {2},
	citeulike-article-id = {535442}
}


@article{citeulike:532296,
	author = {Koopman, B. O.},
	citeulike-article-id = {532296},
	journal = {Transactions of the American Mathematical Society},
	keywords = {exponential-families},
	number = {3},
	pages = {399--409},
	priority = {2},
	title = {On Distributions Admitting A Sufficient Statistic},
	volume = {39},
	year = {1936}
}


@article{citeulike:526228,
	abstract = {In this article we consider the problem of numerical
integration over the $d$-dimensional unit
cube $[0, 1]^d$. If $d = 1$ and the integrand is sufficiently
smooth, then the integral can be evaluated
easily by, say, Simpson's rule, in which case the
error of an $n$-point rule, with $n$ odd, is of order
$O(n^{-4})$. When $d$ is 2 or more, the most obvious
strategy is to apply a rule such as Simpson's rule
in each dimension, creating what is called a product
rule. But now we meet the curse of dimensionality:
the total number of points at which the
integrand must be evaluated (which we may take
as the cost) is $N = n^d$. And with what error? Even
if the integrand is an innocuous function of only
the first component, $x_1$, for example $x_1^4$, the resulting
error for the product Simpson rule is clearly
still of order $O(n^{-4})$, since from the point of view
of this integrand the integration rule is still the
$n$-point Simpson's rule.},
	author = {Kuo and Sloan},
	citeulike-article-id = {526228},
	journal = {Notices of the AMS},
	keywords = {kernel},
	month = dec,
	priority = {2},
	title = {Lifting the curse of dimensionality},
	year = {2005}
}


@unpublished{citeulike:526186,
	abstract = {These notes describe Contrastive Divergence (CD), an approximate Maximum-Likelihood (ML) learning algorithm proposed by Geoffrey Hinton.},
	author = {Woodford, Oliver},
	citeulike-article-id = {526186},
	comment = {Three page overview of CD},
	keywords = {approximate, exponential-families},
	priority = {2},
	title = {Notes on Contrastive Divergence}
}


@article{citeulike:519599,
	author = {Shafer, Glenn},
	title = {Advances in the understanding and use of conditional independence},
	journal = {Annals of Mathematics and Artificial Intelligence},
	volume = {21},
	number = {1},
	pages = {1--11},
	year = {1997},
	publisher = {Kluwer Academic Publishers},
	address = {Hingham, MA, USA},
	issn = {1012-2443},
	doi = {10.1023/A:1018996831193},
	url = {http://portal.acm.org/citation.cfm?id=590159.590194},
	keywords = {algebra, graphical},
	comment = {Brief overview of different kinds of graphical models, and work on algebra of independence structures},
	priority = {2},
	citeulike-article-id = {519599}
}


@misc{citeulike:519594,
	abstract = {Introduction. Let $N$ be a finite set and $S(N)$ the family of all couples $(ij|K)$ where $K \subset N$ and $ij$ is the union of two, not necessarily different, singletons $i$ and $j$ of $N \setminus K$. Elements and singletons of $N$ are not distinguished and the unions of subsets of $N$ are written simply as juxtapositions. Having a system of random variables $\xi = (\xi_i)_{i \in N}$ with subsystems $\xi_K = (\xi_k)_{k \in K}$, $K \subset N$, we introduce the notation $|[\xi]| = \{(ij|K) \in S(N);\ \xi \ldots$ (abstract truncated in the source record).},
	author = {Mat{\'u}{\v{s}}, F. and Studen{\'y}, M.},
	citeulike-article-id = {519594},
	keywords = {algebra, graphical},
	priority = {2},
	title = {Conditional Independences Among Four Random Variables {I}.},
	url = {http://citeseer.ist.psu.edu/67156.html}
}


@article{citeulike:516738,
	abstract = {Augments conditional model with marginals, and fits data in EM-like procedure},
	author = {Edwards and Lauritzen},
	citeulike-article-id = {516738},
	keywords = {exponential-families},
	priority = {2},
	title = {The {ME} algorithm for maximizing a conditional likelihood function}
}


@misc{citeulike:516735,
	author = {Teugels, Jozef L. and Van Horebeek, Johan},
	title = {Generalized Graphical Models for Discrete Data},
	url = {http://citeseer.ist.psu.edu/335255.html},
	abstract = {Traditional graphical models are extended by allowing that the presence or absence of a connection between two nodes depends on the values of the remaining variables. We first compare the extended model to the classical log-linear model. After discussing the induced consistency problem we illustrate the corresponding estimation problem by way of an example.},
	keywords = {graphical},
	comment = {Allows independencies conditioned on outcome},
	priority = {2},
	citeulike-article-id = {516735}
}


@article{citeulike:516702,
	abstract = {This paper discusses a geometry associated with U-divergence including ideas of U-models,
U-loss functions of two versions. On the basis of the geometry we observe that U-divergence
projection of a data distribution $p$ onto U-model $M_U$ associates the Pythagorean relation for
the triangle connection of $p$, $q$ and $q^*$, for any $q$ of the U-model where $q^*$ denotes the point of
$M_U$ projected from $p$. This geometric consideration is implemented on the problem of statistical
pattern recognition. U-Boost algorithm proposed in the practical application is shown
to pursue iteratively the U-divergence projection onto U-model evolving by one dimension according
to one iteration. In particular U-Boost algorithm released to the probability constraint
reveals a novel property of statistical property beyond the notion of Fisher consistency, which
helps us to understand the statistical meaning of AdaBoost.},
	author = {Eguchi, Shinto},
	citeulike-article-id = {516702},
	keywords = {boosting, information-geometry},
	priority = {2},
	title = {Information Geometry and Statistical Pattern Recognition}
}


@techreport{citeulike:516572,
	author = {Wainwright, Martin J. and Jordan, Michael I.},
	citeulike-article-id = {516572},
	comment = {100 page introduction into exponential families, convexity etc},
	institution = {Department of Statistics, University of California, Berkeley},
	keywords = {approximate, exponential-families},
	number = {649},
	priority = {2},
	title = {Graphical models, exponential families, and variational inference},
	year = {2003}
}


@article{citeulike:516570,
	address = {Hingham, MA, USA},
	author = {Azoury, Katy S. and Warmuth, M. K.},
	citeulike-article-id = {516570},
	comment = {Introduces Bregman divergences, facts about exp family},
	doi = {10.1023/A:1010896012157},
	issn = {0885-6125},
	journal = {Machine Learning},
	keywords = {exponential-families},
	month = jun,
	number = {3},
	pages = {211--246},
	priority = {2},
	publisher = {Kluwer Academic Publishers},
	title = {Relative Loss Bounds for On-Line Density Estimation with the Exponential Family of Distributions},
	url = {http://portal.acm.org/citation.cfm?id=599643},
	volume = {43},
	year = {2001}
}


@article{citeulike:515215,
	author = {Seon},
	citeulike-article-id = {515215},
	comment = {Thesis proposal, few facts about information geometry},
	keywords = {information-geometry},
	priority = {2},
	title = {The Information Geometry of Hierarchical {Bayesian} Models}
}


@book{citeulike:515237,
	citeulike-article-id = {515237},
	comment = {Big book, articles from many authors},
	editor = {Pachter, Lior and Sturmfels, Bernd},
	keywords = {algebra},
	priority = {2},
	publisher = {Cambridge University Press},
	title = {Algebraic Statistics for Computational Biology},
	url = {http://www.math.sunysb.edu/~jessie/toric\%20geometry/BIO/bio.pdf},
	year = {2005}
}


@article{citeulike:514100,
	author = {Quaeghebeur},
	title = {Imprecise probability models for inference in exponential families},
	abstract = {When considering sampling models described by a distribution from an exponential family, it is possible to create two types of imprecise probability models. One is based on the corresponding conjugate distribution and the other on the corresponding predictive distribution. In this paper, we show how these types of models can be constructed for any (regular, linear, canonical) exponential family, such as the centered normal distribution. To illustrate the possible use of such models, we take a look at credal classification. We show that they are very natural and potentially promising candidates for describing the attributes of a credal classifier, also in the case of continuous attributes.},
	keywords = {exponential-families},
	comment = {Has table of common exponential families},
	priority = {2},
	citeulike-article-id = {514100}
}


@article{citeulike:440829,
	citeulike-article-id = {440829},
	keywords = {bayesian, prior},
	priority = {2},
	title = {Noninformative Priors Do Not Exist: A Discussion with {Jos{\'e} M. Bernardo}}
}


@article{citeulike:432154,
	title = {minimax entropy},
	priority = {2},
	citeulike-article-id = {432154}
}


@article{citeulike:432153,
	citeulike-article-id = {432153},
	keywords = {information-geometry, model-selection},
	priority = {2},
	title = {Min-Max {Kullback-Leibler} Model Selection}
}


@article{citeulike:430252,
	title = {A Generative-Discriminative Hybrid for Sequential Data Classification},
	keywords = {generative-discriminative},
	priority = {2},
	citeulike-article-id = {430252}
}


@article{citeulike:420572,
	abstract = {Measurements of geometric primitives are often noisy in real applications and we need to use statistics either to reduce the uncertainty (estimation), to compare measurements, or to test hypotheses. Unfortunately, geometric primitives often belong to manifolds that are not vector spaces. In previous works, we used invariance requirements to develop some basic probability tools on transformation groups and homogeneous manifolds that avoids paradoxes. In this paper, we consider the Riemannian metric as the basic structure for the manifold. Based on this metric, we develop the notions of mean value and covariance matrix of a random element, normal law, Mahalanobis distance and $\chi^2$ test. We provide a simple (but highly non trivial) characterization of Karcher means and an original gradient descent algorithm to efficiently compute them. The notion of Normal law we propose is based on the minimization of the information knowing the mean and covariance of the distribution. The resulting family of pdfs spans the whole range from uniform (on compact manifolds) to the point mass distribution. Moreover, we were able to provide tractable approximations (with their limits) for small variances which show that we can effectively implement and work with these definitions. To come back to more practical cases, we then reconsider the case of connected Lie groups and homogeneous manifolds. In our Riemannian context, we investigate the use of invariance principles to choose the metric: we show that it can provide the stability of our statistical definitions w.r.t. geometric operations (composition, inversion and action of transformations). However, an invariant metric does not always exist for homogeneous manifolds, nor does a left and right invariant metric for non-compact Lie groups. In this case, we cannot guarantee the full consistency of geometric and statistical operations. Thus, future work will have to concentrate on constraints weaker than invariance.},
	author = {Pennec},
	citeulike-article-id = {420572},
	comment = {Cool pictures},
	keywords = {information-geometry},
	priority = {2},
	title = {Probabilities and Statistics on {Riemannian} Manifolds: A Geometric approach}
}


@article{citeulike:419001,
	abstract = {In the literature there have been many suggestions on how to parametrize models. Some properties you can seek are (1) stability of variance of the MLE; (2) normal likelihood; (3) zero asymptotic skewness of the MLE; (4) asymptotic unbiasedness of the MLE. The parametrizations corresponding to these demands are found in the one-dimensional curved exponential family. They all belong to a general class of transformations, but they are in general not identical. The transformations in this class are characterized by a differential equation. The transformations are identical in the non-linear normal regression model.},
	author = {Hougaard, Philip},
	citeulike-article-id = {419001},
	journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
	keywords = {parametrization},
	number = {2},
	pages = {244--252},
	priority = {2},
	title = {Parametrizations of Non-Linear Models},
	url = {http://links.jstor.org/sici?sici=0035-9246\%281982\%2944\%3A2\%3C244\%3APONM\%3E2.0.CO\%3B2-X},
	volume = {44},
	year = {1982}
}


@article{citeulike:418922,
	abstract = {We develop a uniform Cramer-Rao lower bound (UCRLB) on the total variance of any estimator of an unknown vector of parameters, with bias gradient matrix whose norm is bounded by a constant. We consider both the Frobenius norm and the spectral norm of the bias gradient matrix, leading to two corresponding lower bounds. We then develop optimal estimators that achieve these lower bounds. In the case in which the measurements are related to the unknown parameters through a linear Gaussian model, Tikhonov regularization is shown to achieve the UCRLB when the Frobenius norm is considered, and the shrunken estimator is shown to achieve the UCRLB when the spectral norm is considered. For more general models, the penalized maximum likelihood (PML) estimator with a suitable penalizing function is shown to asymptotically achieve the UCRLB. To establish the asymptotic optimality of the PML estimator, we first develop the asymptotic mean and variance of the PML estimator for any choice of penalizing function satisfying certain regularity constraints and then derive a general condition on the penalizing function under which the resulting PML estimator asymptotically achieves the UCRLB. This then implies that from all linear and nonlinear estimators with bias gradient whose norm is bounded by a constant, the proposed PML estimator asymptotically results in the smallest possible variance.},
	author = {Eldar, Y. C.},
	citeulike-article-id = {418922},
	comment = {Finds estimator that uniformly minimizes variance subject to bound constraint on the bias gradient},
	journal = {{IEEE} Transactions on Signal Processing},
	keywords = {estimation},
	number = {7},
	pages = {1915--1930},
	priority = {2},
	title = {Minimum variance in biased estimation: bounds and asymptotically optimal estimators},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1306646},
	volume = {52},
	year = {2004}
}


@article{MR2032352,
	abstract = {Standard tools for the analysis of the (average) conditional
association structure of the distribution on a multiway contingency table
are log-linear models. A different association concept is that of marginal
association and this paper demonstrates how marginal log-linear parameters
can be used to measure this aspect of association. The paper gives a
non-technical discussion of these two aspects of association by discussing
their complementary nature and also describes how conditional association
is naturally incorporated in the framework provided by marginal
log-linear parameters. The properties and interpretation of these parameters
are discussed, including the variation independence of hierarchically
related marginal log-linear parameters, and the modeling implications of
these results are indicated.},
	author = {Bergsma, Wicher P. and Rudas, Tam{\'a}s},
	citeulike-article-id = {418880},
	comment = {For non-decomposable model parameters may lie in individual ranges, but not in joint range. Mixed parametrizations},
	journal = {Ann. Fac. Sci. Toulouse Math. (6)},
	keywords = {loglinear},
	number = {4},
	pages = {455--468},
	priority = {2},
	title = {Modeling conditional and marginal association in contingency tables},
	volume = {11},
	year = {2002}
}


% Wasserman (2004): "All of Statistics" textbook.
% The series name lives in its own field rather than in the title; the URL uses an escaped ampersand, not an HTML entity.
@book{citeulike:418876,
	abstract = {This book is for people who want to learn probability and statistics quickly. It brings together many of the main ideas in modern statistics in one place. The book is suitable for students and researchers in statistics, computer science, data mining and machine learning.  This book covers a much wider range of topics than a typical introductory text on mathematical statistics. It includes modern topics like nonparametric curve estimation, bootstrapping and classification, topics that are usually relegated to follow-up courses. The reader is assumed to know calculus and a little linear algebra. No previous knowledge of probability and statistics is required. The text can be used at the advanced undergraduate and graduate level.},
	author = {Wasserman, Larry},
	citeulike-article-id = {418876},
	howpublished = {Hardcover},
	isbn = {0387402721},
	keywords = {statistics},
	month = sep,
	priority = {2},
	publisher = {Springer},
	series = {Springer Texts in Statistics},
	title = {All of Statistics: A Concise Course in Statistical Inference},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0387402721},
	year = {2004}
}


% Barber and Saad (1996), Neural Computation 8(1), pp. 202--214.
% NOTE(review): the same paper is also recorded under key citeulike:410993; cite only one of the two keys per document.
@article{citeulike:418573,
	address = {Cambridge, MA, USA},
	author = {Barber, David and Saad, David},
	citeulike-article-id = {418573},
	comment = {Finds that sometimes restricting version space doesn't improve generalization error},
	issn = {0899-7667},
	journal = {Neural Computation},
	keywords = {ann, ill},
	month = jan,
	number = {1},
	pages = {202--214},
	priority = {2},
	publisher = {MIT Press},
	title = {Does extra knowledge necessarily improve generalization?},
	url = {http://portal.acm.org/citation.cfm?id=218187.218208},
	volume = {8},
	year = {1996}
}


% Toussaint (2004): arXiv notes on information geometry and evolutionary processes.
% archiveprefix identifies the eprint number as an arXiv identifier for eprint-aware styles.
@misc{citeulike:416514,
	abstract = {In order to analyze and extract different structural properties of
distributions, one can introduce different coordinate systems over the manifold
of distributions. In Evolutionary Computation, the Walsh bases and the Building
Block Bases are often used to describe populations, which simplifies the
analysis of evolutionary operators applying on populations. Quite independent
from these approaches, information geometry has been developed as a geometric
way to analyze different order dependencies between random variables (e.g.,
neural activations or genes).


In these notes I briefly review the essentials of various coordinate bases
and of information geometry. The goal is to give an overview and make the
approaches comparable. Besides introducing meaningful coordinate bases,
information geometry also offers an explicit way to distinguish different order
interactions and it offers a geometric view on the manifold and thereby also on
operators that apply on the manifold. For instance, uniform crossover can be
interpreted as an orthogonal projection of a population along an m-geodesic,
monotonously reducing the theta-coordinates that describe interactions between
genes.},
	archiveprefix = {arXiv},
	author = {Toussaint, Marc},
	citeulike-article-id = {416514},
	comment = {Obtaining new parametrizations, Amari's mixed coordinates},
	eprint = {nlin/0408040},
	keywords = {information-geometry},
	month = aug,
	priority = {2},
	title = {Notes on information geometry and evolutionary processes},
	url = {http://arxiv.org/abs/nlin/0408040},
	year = {2004}
}


% Chen and Haykin (2002): review of regularization theory, Neural Computation 14(12).
% Journal name is spelled out in full, matching the other Neural Computation entries in this file.
@article{citeulike:416649,
	abstract = {This review provides a comprehensive understanding of regularization theory from different perspectives, emphasizing smoothness and simplicity principles. Using the tools of operator theory and Fourier analysis, it is shown that the solution of the classical Tikhonov regularization problem can be derived from the regularized functional defined by a linear differential (integral) operator in the spatial (Fourier) domain. State-of-the-art research relevant to the regularization theory is reviewed, covering Occam's razor, minimum length description, Bayesian theory, pruning algorithms, informational (entropy) theory, statistical learning theory, and equivalent regularization. The universal principle of regularization in terms of Kolmogorov complexity is discussed. Finally, some prospective studies on regularization theory and beyond are suggested.},
	address = {Cambridge, MA, USA},
	author = {Chen, Zhe and Haykin, Simon},
	citeulike-article-id = {416649},
	doi = {10.1162/089976602760805296},
	issn = {0899-7667},
	journal = {Neural Computation},
	keywords = {regularization},
	month = dec,
	number = {12},
	pages = {2791--2846},
	priority = {2},
	publisher = {MIT Press},
	title = {On different facets of regularization theory},
	url = {http://portal.acm.org/citation.cfm?id=776153},
	volume = {14},
	year = {2002}
}


% Dhar and Chou (2001): comparison of nonlinear learners for earnings-surprise prediction.
% Quotation marks in the abstract are TeX quotes (the export had left HTML entities behind).
@article{citeulike:416492,
	abstract = {We compare four nonlinear methods on their ability to learn models from data. The problem requires predicting whether a company will deliver an earnings surprise a specific number of days prior to announcement. This problem has been well studied in the literature using linear models. A basic question is whether machine learning-based nonlinear models such as tree induction algorithms, neural networks, naive Bayesian learning, and genetic algorithms perform better in terms of predictive accuracy and in uncovering interesting relationships among problem variables. Equally importantly, if these alternative approaches perform better, why? And how do they stack up relative to each other? The answers to these questions are significant for predictive modeling in the financial arena, and in general for problem domains characterized by significant nonlinearities. In this paper, we compare the four above-mentioned nonlinear methods along a number of criteria. The genetic algorithm turns out to have some advantages in finding multiple ``small disjunct'' patterns that can be accurate and collectively capable of making predictions more often than its competitors. We use some of the nonlinearities we discovered about the problem domain to explain these results.},
	author = {Dhar, V. and Chou, D.},
	citeulike-article-id = {416492},
	comment = {Predicting stocks based on analysts predictions. GA does the best, also trees.},
	journal = {{IEEE} Transactions on Neural Networks},
	number = {4},
	pages = {907--921},
	priority = {2},
	title = {A comparison of nonlinear methods for predicting earnings surprises and returns},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=935099},
	volume = {12},
	year = {2001}
}


% Amari and Douglas (1998): natural gradient adaptation, ICASSP '98 proceedings, vol. 2.
% Conference papers carry the proceedings name in booktitle (required for this entry type), and pages hold page numbers only.
@inproceedings{citeulike:411566,
	abstract = {Gradient adaptation is a useful technique for adjusting a set of parameters to minimize a cost function. While often easy to implement, the convergence speed of gradient adaptation can be slow when the slope of the cost function varies widely for small changes in the parameters. In this paper, we outline an alternative technique, termed natural gradient adaptation, that overcomes the poor convergence properties of gradient adaptation in many cases. The natural gradient is based on differential geometry and employs knowledge of the Riemannian structure of the parameter space to adjust the gradient search direction. Unlike Newton's method, natural gradient adaptation does not assume a locally-quadratic cost function. Moreover, for maximum likelihood estimation tasks, natural gradient adaptation is asymptotically Fisher-efficient. A simple example illustrates the desirable properties of natural gradient adaptation.},
	author = {Amari, S. and Douglas, S. C.},
	booktitle = {Proceedings of the 1998 {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP} '98)},
	citeulike-article-id = {411566},
	comment = {Since natural gradient and Newton's method maximize same objective, are both of them asymptotically Fisher efficient?},
	keywords = {optimization},
	pages = {1213--1216},
	priority = {2},
	title = {Why natural gradient?},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=675489},
	volume = {2},
	year = {1998}
}


% Hagiwara and Kuno (2000): regularization vs. early stopping in linear networks, IJCNN 2000 proceedings, vol. 4.
% Conference papers carry the proceedings name in booktitle (required for this entry type), and pages hold page numbers only.
@inproceedings{citeulike:411004,
	abstract = {Generally, learning is performed so as to minimize the sum of squared errors between network outputs and training data. Unfortunately, this procedure does not necessarily give us a network with good generalization ability when the number of connection weights are relatively large. In such situation, overfitting to the training data occurs. To overcome this problem: there are several approaches such as regularization learning and early stopping. It has been suggested that these two methods are closely related. In this article, we firstly give an unified interpretation for the relationship between two methods through the analysis of linear networks in the context of statistical regression; i.e. linear regression model. On the other hand, several theoretical works have been done on the optimal regularization parameter and the optimal stopping time. Here, we also consider the problem from the unified viewpoint mentioned above. This analysis enables us to understand the statistical meaning of the optimality. Then, the estimates of the optimal regularization parameter and the optimal stopping time are present and those are examined by simple numerical simulations. Moreover, for the choice of regularization parameter, the relationship between the Bayesian framework and the generalization error minimization framework is discussed.},
	author = {Hagiwara, K. and Kuno, K.},
	booktitle = {Proceedings of the {IEEE-INNS-ENNS} International Joint Conference on Neural Networks ({IJCNN} 2000)},
	citeulike-article-id = {411004},
	comment = {Derives optimal regularizer weight when true function is known.},
	keywords = {regularization},
	pages = {511--516},
	priority = {2},
	title = {Regularization learning and early stopping in linear networks},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=860822},
	volume = {4},
	year = {2000}
}


% Barber and Saad (1996), Neural Computation 8(1), pp. 202--214.
% NOTE(review): the same paper is also recorded under key citeulike:418573; cite only one of the two keys per document.
@article{citeulike:410993,
	address = {Cambridge, MA, USA},
	author = {Barber, David and Saad, David},
	citeulike-article-id = {410993},
	issn = {0899-7667},
	journal = {Neural Computation},
	keywords = {ill, statistical-learning-theory},
	month = jan,
	number = {1},
	pages = {202--214},
	priority = {2},
	publisher = {MIT Press},
	title = {Does extra knowledge necessarily improve generalization?},
	url = {http://portal.acm.org/citation.cfm?id=218208},
	volume = {8},
	year = {1996}
}


% Heskes (2000): natural-gradient learning and pruning for multilayered perceptrons, Neural Computation 12(4).
% The quoted word in the title is brace-protected TeX quotes (the export had left HTML numeric entities behind).
@article{citeulike:410992,
	abstract = {Several studies have shown that natural gradient descent for on-line learning is much more efficient than standard gradient descent. In this article, we derive natural gradients in a slightly different manner and discuss implications for batch-mode learning and pruning, linking them to existing algorithms such as Levenberg-Marquardt optimization and optimal brain surgeon. The Fisher matrix plays an important role in all these algorithms. The second half of the article discusses a layered approximation of the Fisher matrix specific to multilayered perceptrons. Using this approximation rather than the exact Fisher matrix, we arrive at much faster ``natural'' learning algorithms and more robust pruning procedures.},
	author = {Heskes, T.},
	citeulike-article-id = {410992},
	comment = {Describes "natural" way of doing online learning, where distance between distributions is measured using the same metric as loss. When loss is log-loss, get natural gradient. Connections with Levenberg-Marquardt},
	issn = {0899-7667},
	journal = {Neural Computation},
	keywords = {ann, information-geometry, optimization, regularization},
	month = apr,
	number = {4},
	pages = {881--901},
	priority = {2},
	title = {On {``Natural''} Learning and Pruning in Multilayered Perceptrons},
	url = {http://www.ingentaconnect.com/content/mitpress/neco/2000/00000012/00000004/art00007},
	volume = {12},
	year = {2000}
}


% Bortoletti, Di Fiore, Fanelli and Zellini (2003): LQN quasi-Newton methods for MLP training.
% Braces in the title protect the proper noun and the acronym from style-driven lowercasing.
@article{citeulike:410981,
	abstract = {In this paper, we present a new class of quasi-Newton methods for an effective learning in large multilayer perceptron (MLP)-networks. The algorithms introduced in this work, named LQN, utilize an iterative scheme of a generalized BFGS-type method, involving a suitable family of matrix algebras L. The main advantages of these innovative methods are based upon the fact that they have an O(nlogn) complexity per step and that they require O(n) memory allocations. Numerical experiences, performed on a set of standard benchmarks of MLP-networks, show the competitivity of the LQN methods, especially for large values of n.},
	author = {Bortoletti, A. and Di Fiore, C. and Fanelli, S. and Zellini, P.},
	citeulike-article-id = {410981},
	comment = {Order n log n methods for optimization},
	journal = {{IEEE} Transactions on Neural Networks},
	keywords = {optimization},
	number = {2},
	pages = {263--273},
	priority = {2},
	title = {A new class of quasi-{Newtonian} methods for optimal learning in {MLP}-networks},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1189625},
	volume = {14},
	year = {2003}
}


% Navia-Vazquez and Figueiras-Vidal (2000): layerwise block training of multilayer perceptrons, Neural Computation 12(6).
@article{citeulike:410978,
	abstract = {The attractive possibility of applying layerwise block training algorithms to multilayer perceptrons MLP, which offers initial advantages in computational effort, is refined in this article by means of introducing a sensitivity correction factor in the formulation. This results in a clear performance advantage, which we verify in several applications. The reasons for this advantage are discussed and related to implicit relations with second-order techniques, natural gradient formulations through Fisher's information matrix, and sample selection. Extensions to recurrent networks and other research lines are suggested at the close of the article.},
	author = {Navia-Vazquez, A. and Figueiras-Vidal, A. R.},
	citeulike-article-id = {410978},
	comment = {new optimization algorithm and relations to Newton's method, natural gradient},
	issn = {0899-7667},
	journal = {Neural Computation},
	keywords = {optimization},
	month = jun,
	number = {6},
	pages = {1429--1447},
	priority = {2},
	title = {Efficient Block Training of Multilayer Perceptrons},
	url = {http://www.ingentaconnect.com/content/mitpress/neco/2000/00000012/00000006/art00011},
	volume = {12},
	year = {2000}
}


% Micchelli: "Feature space perspectives for learning the kernel" (record is incomplete).
% NOTE(review): typed as an article but has no journal, volume or year, so standard styles will warn about missing fields.
% NOTE(review): author is given as a bare surname; any co-authors and given names need to be confirmed against the paper.
% NOTE(review): the last abstract sentence contains an o-circumflex accent command that looks like a garbled math symbol; confirm the intended norm before fixing.
@article{citeulike:410975,
	abstract = {In this paper, we continue our study of learning the kernel. We present a reformulation of
this problem within a feature space environment. This leads us to study regularization in the
dual space of all continuous functions on a compact domain with values in a Hilbert space with
a mix norm. We also relate this problem in a special case to \^{o} regularization.},
	author = {Micchelli},
	citeulike-article-id = {410975},
	keywords = {regularization},
	priority = {2},
	title = {Feature space perspectives for learning the kernel}
}


% Wolf and George (2000): arXiv preprint on maximally informative statistics.
% archiveprefix identifies the eprint number as an arXiv identifier for eprint-aware styles.
@misc{citeulike:410963,
	abstract = {In this paper we propose a Bayesian, information theoretic approach to
dimensionality reduction. The approach is formulated as a variational principle
on mutual information, and seamlessly addresses the notions of sufficiency,
relevance, and representation. Maximally informative statistics are shown to
minimize a Kullback-Leibler distance between posterior distributions.
Illustrating the approach, we derive the maximally informative one dimensional
statistic for a random sample from the Cauchy distribution.},
	archiveprefix = {arXiv},
	author = {Wolf, David R. and George, Edward I.},
	citeulike-article-id = {410963},
	comment = {finds statistic k(.) which minimizes KL-divergence between p(f|data) and p(f|k(data))},
	eprint = {physics/0010039},
	keywords = {bayesian, feature-selection},
	month = oct,
	priority = {2},
	title = {Maximally Informative Statistics},
	url = {http://arxiv.org/abs/physics/0010039},
	year = {2000}
}


% Amari, Murata, Mueller, Finke and Yang (1997): asymptotic theory of overtraining and cross-validated early stopping.
% The umlaut in the third author's surname uses the brace-wrapped special-character form.
@article{citeulike:410945,
	abstract = {A statistical theory for overtraining is proposed. The analysis treats general realizable stochastic neural networks, trained with Kullback-Leibler divergence in the asymptotic case of a large number of training examples. It is shown that the asymptotic gain in the generalization error is small if we perform early stopping, even if we have access to the optimal stopping time. Based on the cross-validation stopping we consider the ratio the examples should be divided into training and cross-validation sets in order to obtain the optimum performance. Although cross-validated early stopping is useless in the asymptotic region, it surely decreases the generalization error in the nonasymptotic region. Our large scale simulations done on a CM5 are in good agreement with our analytical findings.},
	author = {Amari, S. and Murata, N. and M{\"u}ller, K. R. and Finke, M. and Yang, H. H.},
	citeulike-article-id = {410945},
	comment = {finds optimum train/validation split size},
	journal = {{IEEE} Transactions on Neural Networks},
	keywords = {regularization},
	number = {5},
	pages = {985--996},
	priority = {2},
	title = {Asymptotic statistical theory of overtraining and cross-validation},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=623200},
	volume = {8},
	year = {1997}
}


% Ruggeri and Vidakovic (2005): overview of Bayesian inference in the wavelet domain.
% NOTE(review): authors are surname-only and the abstract calls this a chapter; confirm whether a published book-chapter record should replace this one.
@unpublished{citeulike:409738,
	abstract = {Wavelets are the building blocks of wavelet transforms the same way that
the functions $e^{inx}$ are the building blocks of the ordinary Fourier transform.
But in contrast to sines and cosines, wavelets can be (or almost can be) supported
on an arbitrarily small closed interval. This feature makes wavelets
a very powerful tool in dealing with phenomena that change rapidly in
time. In many statistical applications, there is a need for procedures to (i)
adapt to data and (ii) use prior information. The interface of wavelets and
the Bayesian paradigm provides a natural terrain for both of these goals.
In this chapter, the authors provide an overview of the current status of
research involving Bayesian inference in wavelet nonparametric problems.
Two applications, one in functional data analysis (FDA) and the second in
geoscience are discussed in more detail.},
	author = {Ruggeri and Vidakovic},
	citeulike-article-id = {409738},
	keywords = {approximation},
	note = {Manuscript},
	priority = {2},
	title = {{Bayesian} Modelling in the wavelet domain},
	year = {2005}
}


% Azoury and Warmuth (2001): relative loss bounds for on-line density estimation, Machine Learning 43(3).
@article{citeulike:409708,
	abstract = {We consider on-line density estimation with a parameterized density from the exponential family. The on-line algorithm receives one example at a time and maintains a parameter that is essentially an average of the past examples. After receiving an example the algorithm incurs a loss, which is the negative log-likelihood of the example with respect to the current parameter of the algorithm. An off-line algorithm can choose the best parameter based on all the examples. We prove bounds on the additional total loss of the on-line algorithm over the total loss of the best off-line parameter. These relative loss bounds hold for an arbitrary sequence of examples. The goal is to design algorithms with the best possible relative loss bounds. We use a Bregman divergence to derive and analyze each algorithm. These divergences are relative entropies between two exponential distributions. We also use our methods to prove relative loss bounds for linear regression.},
	author = {Azoury, K. S. and Warmuth, M. K.},
	citeulike-article-id = {409708},
	comment = {Duality between mean-field and canonical parametrizations},
	issn = {0885-6125},
	journal = {Machine Learning},
	keywords = {exponential-families},
	month = jun,
	number = {3},
	pages = {211--246},
	priority = {2},
	title = {Relative Loss Bounds for On-Line Density Estimation with the Exponential Family of Distributions},
	url = {http://www.ingentaconnect.com/content/klu/mach/2001/00000043/00000003/00333256},
	volume = {43},
	year = {2001}
}


% Bartlett (1997): generalization depends on weight size rather than network size; NIPS volume 9.
% The abstract is a truncated excerpt (it ends in an ellipsis in the source record).
@inproceedings{citeulike:409455,
	abstract = {This paper shows that if a large neural network is used for a pattern
classification problem, and the learning algorithm finds a network
with small weights that has small squared error on the training
patterns, then the generalization performance depends on the size
of the weights rather than the number of weights. More specifically,
consider an $\ell$-layer feed-forward network of sigmoid units, in
which the sum of the magnitudes of the weights associated with
each unit is bounded by A. The...},
	author = {Bartlett, Peter L.},
	booktitle = {Advances in Neural Information Processing Systems},
	citeulike-article-id = {409455},
	editor = {Mozer, Michael C. and Jordan, Michael I. and Petsche, Thomas},
	keywords = {statistical-learning-theory},
	priority = {2},
	publisher = {The MIT Press},
	title = {For Valid Generalization the Size of the Weights is More Important than the Size of the Network},
	url = {http://citeseer.ist.psu.edu/bartlett97for.html},
	volume = {9},
	year = {1997}
}


% Friedman and Goldszmidt (1999): learning Bayesian networks with local (tree-structured) CPDs.
% Recorded as a chapter in an edited volume: the record carries an ISBN, publisher and page range but no journal.
% NOTE(review): booktitle and editor were supplied from the ISBN's volume; confirm against the book.
@incollection{citeulike:409452,
	address = {Cambridge, MA, USA},
	author = {Friedman, Nir and Goldszmidt, Moises},
	booktitle = {Learning in Graphical Models},
	citeulike-article-id = {409452},
	comment = {Decision trees to parametrize local CPD's. Heckerman's MS Research group has similar work with decision graphs that represent more kinds of equality constraints on the CPD.},
	editor = {Jordan, Michael I.},
	isbn = {0262600323},
	keywords = {bayesnet},
	pages = {421--459},
	priority = {2},
	publisher = {MIT Press},
	title = {Learning {Bayesian} networks with local structure},
	url = {http://portal.acm.org/citation.cfm?id=308574.308691},
	year = {1999}
}


% Chan and Darwiche (2002): distance measure for bounding probabilistic belief change (AAAI national conference paper).
% Fields are grouped as: who/what, where published, identifiers, then local bookkeeping.
@inproceedings{citeulike:409450,
	author = {Chan, Hei and Darwiche, Adnan},
	title = {A distance measure for bounding probabilistic belief change},
	booktitle = {Eighteenth national conference on Artificial intelligence},
	pages = {539--545},
	publisher = {American Association for Artificial Intelligence},
	address = {Menlo Park, CA, USA},
	year = {2002},
	isbn = {0262511290},
	url = {http://portal.acm.org/citation.cfm?id=777092.777176},
	keywords = {approximation, bayesian},
	priority = {2},
	citeulike-article-id = {409450}
}


% Friedman (1996): sample complexity of MDL-based learning of Bayesian networks.
% The sample-complexity bound in the abstract is written as LaTeX math (epsilon = error threshold, delta = confidence).
% NOTE(review): typed as an article but no journal is given, and the author is a bare surname; confirm venue and full author list.
@article{citeulike:409035,
	abstract = {In recent years there has been an increasing interest in learning Bayesian networks from
data. One of the most effective methods for learning such networks is based on the minimum
description length (MDL) principle. Previous work has shown that this learning procedure
is asymptotically successful: with probability one, it will converge to the target distribution,
given a sufficient number of samples. However, the rate of this convergence has been hitherto
unknown.
In this work we examine the sample complexity of MDL based learning procedures for
Bayesian networks. We show that the number of samples needed to learn an $\epsilon$-close
approximation (in terms of entropy distance) with confidence $\delta$ is
$O\left(\left(\frac{1}{\epsilon}\right)^{4/3} \log\frac{1}{\epsilon} \log\frac{1}{\delta} \log\log\frac{1}{\delta}\right)$.
This means that the sample complexity is a low-order polynomial in the error threshold and
sub-linear in the confidence bound. We also discuss how the constants in this term depend on the
complexity of the target distribution. Finally, we address questions of asymptotic minimality
and propose a method for using the sample complexity results to speed up the learning process.},
	author = {Friedman},
	citeulike-article-id = {409035},
	comment = {Bounding KL dispersion (Sanov's theorem), bounds in terms of skewness of true distribution},
	keywords = {statistical-learning-theory},
	priority = {2},
	title = {On the Sample Complexity of Learning {Bayesian} Networks},
	year = {1996}
}


% Friedman (2004): gradient-directed regularization paths for linear regression and classification.
% NOTE(review): technical reports require an institution and none is recorded; the author is a bare surname. Confirm both.
@techreport{citeulike:409019,
	abstract = {Regularization in linear modeling is viewed as a two-stage process. First a set of candidate
models is defined by a path through the space of joint parameter values, and then
a point on this path is chosen to be the final model. Various pathfinding strategies for
the first stage of this process are examined, based on the notion of generalized gradient
descent. Several of these strategies are seen to produce paths that closely correspond to
those induced by commonly used penalization methods. Others give rise to new regularization
techniques that are shown to be advantageous in some situations. In all cases, the
gradient descent pathfinding paradigm can be readily generalized to include the use of a
wide variety of loss criteria, leading to robust methods for regression and classification, as
well as to apply user defined constraints on the parameter values, all with highly efficient
computational implementations.},
	author = {Friedman},
	citeulike-article-id = {409019},
	comment = {Truncated gradient which subsumes lasso and ridge regression as special cases. Link between early stopping in gradient descent and penalized ML.},
	keywords = {optimization, regularization},
	priority = {2},
	title = {Gradient directed regularization for linear regression and classification},
	year = {2004}
}


% Hansen (1999): Bayes averaging within the family of "temperated" likelihoods.
% NOTE(review): conference papers require a booktitle and none is recorded; the author is a bare surname. Confirm both.
@inproceedings{citeulike:407793,
	abstract = {Bayesian predictions are stochastic just like predictions of any other
inference scheme that generalize from a finite sample. While a simple variational argument shows that Bayes averaging is generalization optimal given that the prior matches the teacher parameter
distribution the situation is less clear if the teacher distribution is
unknown. I define a class of averaging procedures, the temperated
likelihoods, including both Bayes averaging with a uniform prior
and maximum likelihood estimation as special cases. I show that
Bayes is generalization optimal in this family for any teacher distribution for two learning problems that are analytically tractable:
learning the mean of a Gaussian and asymptotics of smooth learners.},
	author = {Hansen},
	citeulike-article-id = {407793},
	comment = {Nice bias-variance decomposition graph. Shows that optimal value is independent of true teacher distribution.},
	keywords = {bayesian},
	priority = {2},
	title = {{Bayesian} Averaging is Well-Temperated},
	year = {1999}
}


% Fabricius: master's thesis on Bayesian model selection (record is incomplete).
% NOTE(review): thesis entries require school and year and neither is recorded; the author is a bare surname. Confirm before citing.
@mastersthesis{citeulike:407792,
	author = {Fabricius},
	citeulike-article-id = {407792},
	comment = {temperated learning},
	keywords = {bayesian, model-selection},
	priority = {2},
	title = {{Bayesian} Model Selection}
}


% Bartlett, Jordan and McAuliffe: risk bounds for convex surrogate losses in classification (JASA).
% NOTE(review): volume, number and pages are not recorded, and given names were expanded from surname-only data; confirm against the published article.
@article{bjm-ccrb-05,
	abstract = {Many of the classification algorithms developed in the machine learning literature, including the support vector machine and boosting, can be viewed as minimum contrast methods that minimize a convex surrogate of the 0-1 loss function. The convexity makes these algorithms computationally efficient. The use of a surrogate, however, has statistical consequences that must be balanced against the computational virtues of convexity. To study these issues, we provide a general quantitative relationship between the risk as assessed using the 0-1 loss and the risk as assessed using any nonnegative surrogate loss function. We show that this relationship gives nontrivial upper bounds on excess risk under the weakest possible condition on the loss function: that it satisfy a pointwise form of Fisher consistency for classification. The relationship is based on a simple variational transformation of the loss function that is easy to compute in many applications. We also present a refined version of this result in the case of low noise. Finally, we present applications of our results to the estimation of convergence rates in the general setting of function classes that are scaled convex hulls of a finite-dimensional base class, with a variety of commonly used loss functions.},
	author = {Bartlett, Peter L. and Jordan, Michael I. and McAuliffe, Jon D.},
	citeulike-article-id = {407786},
	journal = {Journal of the American Statistical Association},
	keywords = {statistical-learning-theory},
	priority = {2},
	title = {Convexity, classification, and risk bounds},
	year = {2005}
}


% Nishiue and Watanabe (2005): effect of prior choice on model selection for singular learning machines.
% The publisher's trailing notice in the abstract is kept as plain text (HTML anchor markup removed).
@article{citeulike:407783,
	abstract = {In statistical learning by information systems, operations are required for selecting the model that best fits the given criterion from among multiple model candidates. The types of criteria that can be used include a consistency criterion for detecting the true distribution according to a maximum probability and an efficiency criterion for minimizing the prediction error. Although criteria such as the AIC or BIC have been proposed and their characteristics have been studied in detail for a regular statistical model, there are many unknowns concerning criteria for selecting a learning model having singularities such as a neural network or mixed normal distribution. In this paper, the authors examine Bayesian learning by models having singularities and compare a method that always uses a positive prior distribution and a method that uses Jeffreys' prior distribution from viewpoints related to consistency and efficiency when selecting a model for minimizing the stochastic complexity. Based on a theoretical proposition that is already known, they rationally predict the difference between the two distributions and verify that prediction experimentally. In particular, they clearly demonstrate experimentally that when the family of learning models includes the true distribution, Jeffreys' prior distribution is superior in terms of both consistency and efficiency, but when the family of learning models does not include the true distribution, the prior distribution that always takes positive values is superior in terms of efficiency. {\copyright} 2005 Wiley Periodicals, Inc. Electron Comm Jpn Pt 2, 88(2): 47--58, 2005; Published online in Wiley InterScience (www.interscience.wiley.com). DOI 10.1002/ecjb.20147},
	author = {Nishiue, Koichiro and Watanabe, Sumio},
	citeulike-article-id = {407783},
	doi = {10.1002/ecjb.20147},
	issn = {1520-6432},
	journal = {Electronics and Communications in Japan (Part II: Electronics)},
	keywords = {bayesian, prior},
	month = jan,
	number = {2},
	pages = {47--58},
	priority = {2},
	title = {Effects of priors in model selection problem of learning machines with singularities},
	url = {http://dx.doi.org/10.1002/ecjb.20147},
	volume = {88},
	year = {2005}
}


@article{citeulike:407782,
	abstract = {There has been a recent growth in the use of Bayesian methods in medical research. The main reasons for this are the development of computer intensive simulation based methods such as Markov chain Monte Carlo (MCMC), increases in computing power and the introduction of powerful software such as WinBUGS. This has enabled increasingly complex models to be fitted. The ability to fit these complex models has led to MCMC methods being used as a convenient tool by frequentists, who may have no desire to be fully Bayesian. Often researchers want `the data to dominate' when there is no prior information and thus attempt to use vague prior distributions. However, with small amounts of data the use of vague priors can be problematic. The results are potentially sensitive to the choice of prior distribution. In general there are fewer problems with location parameters. The main problem is with scale parameters. With scale parameters, not only does one have to decide the distributional form of the prior distribution, but also whether to put the prior distribution on the variance, standard deviation or precision. We have conducted a simulation study comparing the effects of 13 different prior distributions for the scale parameter on simulated random effects meta-analysis data. We varied the number of studies (5, 10 and 30) and compared three different between-study variances to give nine different simulation scenarios. One thousand data sets were generated for each scenario and each data set was analysed using the 13 different prior distributions. The frequentist properties of bias and coverage were investigated for the between-study variance and the effect size. The choice of prior distribution was crucial when there were just five studies. There was a large variation in the estimates of the between-study variance for the 13 different prior distributions. With a large number of studies the choice of prior distribution was less important. The effect size estimated was not biased, but the precision with which it was estimated varied with the choice of prior distribution leading to varying coverage intervals and, potentially, to different statistical inferences. Again there was less of a problem with a larger number of studies. There is a particular problem if the between-study variance is close to the boundary at zero, as MCMC results tend to produce upwardly biased estimates of the between-study variance, particularly if inferences are based on the posterior mean. The choice of `vague' prior distribution can lead to a marked variation in results, particularly in small studies. Sensitivity to the choice of prior distribution should always be assessed. Copyright {\copyright} 2005 John Wiley \& Sons, Ltd.},
	author = {Lambert, Paul C. and Sutton, Alex J. and Burton, Paul R. and Abrams, Keith R. and Jones, David R.},
	citeulike-article-id = {407782},
	doi = {10.1002/sim.2112},
	issn = {1097-0258},
	journal = {Statistics in Medicine},
	keywords = {bayesian, prior},
	month = jul,
	number = {15},
	pages = {2401--2428},
	priority = {2},
	title = {How vague is vague? {A} simulation study of the impact of the use of vague prior distributions in {MCMC} using {WinBUGS}},
	url = {http://dx.doi.org/10.1002/sim.2112},
	volume = {24},
	year = {2005}
}


@article{citeulike:407691,
	author = {Haughton, Dominique M. A.},
	title = {On the Choice of a Model to Fit Data from an Exponential Family},
	journal = {The Annals of Statistics},
	volume = {16},
	number = {1},
	pages = {342--355},
	year = {1988},
	url = {http://links.jstor.org/sici?sici=0090-5364\%28198803\%2916\%3A1\%3C342\%3AOTCOAM\%3E2.0.CO\%3B2-K},
	keywords = {model-selection},
	citeulike-article-id = {407691},
	priority = {2}
}


@inproceedings{citeulike:406720,
	abstract = {Conditional maximum entropy (ME) models provide a general purpose machine learning technique which has been successfully applied to fields as diverse as computer vision and econometrics, and which is used for a wide variety of classification problems in natural language processing. However, the flexibility of ME models is not without cost. While parameter estimation for ME models is conceptually straightforward, in practice ME models for typical natural language tasks are very large, and may well contain many thousands of free parameters. In this paper, we consider a number of algorithms for estimating the parameters of ME models, including iterative scaling, gradient ascent, conjugate gradient, and variable metric methods. Surprisingly, the standardly used iterative scaling algorithms perform quite poorly in comparison to the others, and for all of the test problems, a limited-memory variable metric algorithm outperformed the other choices.},
	author = {Malouf, Robert},
	booktitle = {Proceedings of the Sixth Conference on Natural Language Learning ({CoNLL}-2002)},
	citeulike-article-id = {406720},
	keywords = {maxent, optimization},
	pages = {49--55},
	priority = {2},
	title = {A comparison of algorithms for maximum entropy parameter estimation},
	year = {2002}
}


@inproceedings{citeulike:406642,
	abstract = {We present an analysis of how the generalization performance (expected test set error) relates to the expected training set error for nonlinear learning systems, such as multilayer perceptrons and radial basis functions. The principal result is the following relationship (computed to second order) between the expected test set and training set errors: $\langle E_{\mathrm{test}}(\lambda) \rangle_{\xi\xi'} \approx \langle E_{\mathrm{train}}(\lambda) \rangle_{\xi} + 2 \sigma_{\mathrm{eff}}^{2} \frac{p_{\mathrm{eff}}(\lambda)}{n}$ (1). Here, $n$ is the size of the training sample $\xi$, $\sigma_{\mathrm{eff}}^{2}$ is the effective noise variance in the response variable(s), $\lambda$ is a regularization or weight decay parameter, and $p_{\mathrm{eff}}(\lambda)$ is the effective number of parameters in the nonlinear model. The expectations $\langle\,\rangle$ of training set and test set errors are taken over possible training sets $\xi$ and training and test sets $\xi'$ respectively. The effective number of parameters $p_{\mathrm{eff}}(\lambda)$ usually differs from the true number of model parameters $p$ for nonlinear or regularized models; this theoretical conclusion is supported by Monte Carlo experiments. In addition to the surprising result that $p_{\mathrm{eff}}(\lambda) \neq p$, we propose an estimate of (1) called the generalized prediction error (GPE) which generalizes well established estimates of prediction risk such as Akaike's FPE and AIC, Mallows $C_P$, and Barron's PSE to the nonlinear setting.},
	author = {Moody},
	booktitle = {Advances in Neural Information Processing Systems ({NIPS})},
	citeulike-article-id = {406642},
	comment = {Seems related to Murata's work
Says AIC is only appropriate for linear model selection},
	keywords = {model-selection, regularization},
	priority = {2},
	title = {The effective number of parameters: An analysis of generalization and regularization in non-linear learning systems},
	year = {1991}
}


@unpublished{citeulike:405111,
	author = {Shewchuk, Jonathan Richard},
	citeulike-article-id = {405111},
	keywords = {optimization},
	note = {Technical Report CMU-CS-94-125, School of Computer Science, Carnegie Mellon University},
	priority = {2},
	title = {An Introduction to the Conjugate Gradient Method Without the Agonizing Pain},
	year = {1994}
}


@article{citeulike:405101,
	abstract = {We provide an alternative to the maximum likelihood method for making inferences about the parameters of the logistic regression model. The method is based on appropriate permutational distributions of sufficient statistics. It is useful for analysing small or unbalanced binary data with covariates. It also applies to small-sample clustered binary data. We illustrate the method by analysing several biomedical data sets.},
	address = {Department of Biostatistics, Harvard School of Public Health, USA.},
	author = {Mehta, C. R. and Patel, N. R.},
	citeulike-article-id = {405101},
	comment = {an alternative to maximum likelihood},
	issn = {0277-6715},
	journal = {Statistics in Medicine},
	keywords = {estimation},
	month = oct,
	number = {19},
	pages = {2143--2160},
	priority = {2},
	title = {Exact logistic regression: theory and examples},
	url = {http://view.ncbi.nlm.nih.gov/pubmed/8552893},
	volume = {14},
	year = {1995}
}


@article{citeulike:404970,
	abstract = {similarity to Rissanen's work},
	author = {Fossgaard},
	citeulike-article-id = {404970},
	keywords = {information-geometry},
	priority = {2},
	title = {An invariant {Bayesian} model selection principle for {Gaussian} data in a sparse representation}
}


@article{citeulike:404842,
	address = {Cambridge, MA, USA},
	author = {Perkins, Simon and Lacker, Kevin and Theiler, James},
	citeulike-article-id = {404842},
	comment = {Coordinate-wise descent faster than full gradient},
	issn = {1533-7928},
	journal = {Journal of Machine Learning Research},
	keywords = {optimization},
	pages = {1333--1356},
	priority = {2},
	publisher = {MIT Press},
	title = {Grafting: fast, incremental feature selection by gradient descent in function space},
	url = {http://portal.acm.org/citation.cfm?id=944976},
	volume = {3},
	year = {2003}
}


@article{citeulike:404207,
	abstract = {Investigators interested in model order estimation have tended to divide themselves into widely separated camps; this survey of the contributions of Schwarz, Wallace, Rissanen, and their coworkers attempts to build bridges between the various viewpoints, illuminating connections which may have previously gone unnoticed and clarifying misconceptions which seem to have propagated in the applied literature. Our tour begins with Schwarz's approximation of Bayesian integrals via Laplace's method. We then introduce the concepts underlying Rissanen's minimum description length principle via a Bayesian scenario with a known prior; this provides the groundwork for understanding his more complex non-Bayesian MDL which employs a ``universal'' encoding of the integers. Rissanen's method of parameter truncation is contrasted with that employed in various versions of Wallace's minimum message length criteria. Rissanen's more recent notion of stochastic complexity is outlined in terms of Bernardo's information-theoretic derivation of the Jeffreys prior.},
	author = {Lanterman},
	citeulike-article-id = {404207},
	keywords = {information-geometry, model-selection, toread},
	priority = {2},
	title = {{Schwarz}, {Wallace}, and {Rissanen}: Intertwining themes in theories of model selection},
	year = {2000}
}


@article{citeulike:403898,
	author = {Fisher, R. A.},
	citeulike-article-id = {403898},
	comment = {Mentions parametrization invariance principle},
	journal = {Philosophical Transactions of the Royal Society of London. Series A, Containing Papers of a Mathematical or Physical Character},
	keywords = {philosophy, statistics},
	pages = {309--368},
	priority = {2},
	title = {On the Mathematical Foundations of Theoretical Statistics},
	url = {http://links.jstor.org/sici?sici=0264-3952\%281922\%29222\%3C309\%3AOTMFOT\%3E2.0.CO\%3B2-2},
	volume = {222},
	year = {1922}
}


@article{citeulike:403893,
	author = {Hilgevoord and Uffink},
	title = {Uncertainty in prediction and in inference},
	abstract = {The concepts of uncertainty in prediction and inference are introduced and illustrated using the diffraction of light as an example. The close relationship between the concepts of uncertainty in inference and resolving power is noted. A general quantitative measure of uncertainty in inference can be obtained by means of the so-called statistical distance between probability distributions. When applied to quantum mechanics, this distance leads to a measure of the distinguishability of quantum states, which essentially is the absolute value of the matrix element between the states. The importance of this result to the quantum mechanical uncertainty principle is noted. The second part of the paper provides a derivation of the statistical distance on basis of the so-called method of support.},
	comment = {Fisher Information and resolvability},
	keywords = {philosophy, toread},
	citeulike-article-id = {403893},
	priority = {2}
}


@article{citeulike:403859,
	address = {Cambridge, MA, USA},
	author = {Rosset, Saharon and Zhu, Ji and Hastie, Trevor},
	citeulike-article-id = {403859},
	issn = {1533-7928},
	journal = {Journal of Machine Learning Research},
	keywords = {regularization},
	pages = {941--973},
	priority = {2},
	publisher = {MIT Press},
	title = {Boosting as a Regularized Path to a Maximum Margin Classifier},
	url = {http://portal.acm.org/citation.cfm?id=1005332.1016790},
	volume = {5},
	year = {2004}
}


@article{citeulike:403844,
	abstract = {We consider the generic regularized optimization problem $\hat{\beta}(\lambda) = \arg\min_{\beta} L(y, X\beta) + \lambda J(\beta)$. Recently, Efron et al. (2004) have shown that for the Lasso -- that is, if $L$ is squared error loss and $J(\beta) = \|\beta\|_1$ is the $\ell_1$ norm of $\beta$ -- the optimal coefficient path is piecewise linear, i.e., $\partial \hat{\beta}(\lambda) / \partial \lambda$ is piecewise constant. We derive a general characterization of the properties of (loss $L$, penalty $J$) pairs which give piecewise linear coefficient paths. Such pairs allow for efficient generation of the full regularized coefficient paths. We investigate the nature of efficient path following algorithms which arise. We use our results to suggest robust versions of the Lasso for regression and classification, and to develop new, efficient algorithms for existing problems in the literature, including Mammen \& van de Geer's Locally Adaptive Regression Splines.},
	author = {Rosset, Saharon and Zhu, Ji},
	citeulike-article-id = {403844},
	keywords = {regularization},
	priority = {2},
	title = {Piecewise Linear Regularized Solution Paths},
	year = {2004}
}


@article{citeulike:403839,
	abstract = {How should we decide among competing explanations (models) of a cognitive phenomenon? This problem of model selection is at the heart of the scientific enterprise. Ideally, we would like to identify the model that actually generated the data at hand. However, this is an un-achievable goal as it is fundamentally ill-posed. Information in a finite data sample is seldom sufficient to point to a single model. Multiple models may provide equally good descriptions of the data, a problem that is exacerbated by the presence of random error in the data. In fact, model selection bears a striking similarity to perception, in that both require solving an inverse problem. Just as perceptual ambiguity can be addressed only by introducing external constraints on the interpretation of visual images, the ill-posedness of the model selection problem requires us to introduce external constraints on the choice of the most appropriate model. Model selection methods differ in how these external constraints are conceptualized and formalized. In this review we discuss the development of the various approaches, the differences between them, and why the methods perform as they do. An application example of selection methods in cognitive modeling is also discussed.},
	author = {Myung and Pitt and Navarro},
	citeulike-article-id = {403839},
	keywords = {information-geometry, model-selection},
	priority = {2},
	title = {Model selection in cognitive science as an inverse problem}
}


@article{citeulike:402330,
	author = {Mitchell},
	citeulike-article-id = {402330},
	keywords = {generative-discriminative},
	priority = {2},
	title = {Generative and Discriminative Classifiers: {Naive} {Bayes} and Logistic Regression}
}


@inproceedings{citeulike:402326,
	abstract = {We propose the use of learning algorithm profiles to address the model selection problem in knowledge discovery systems. These profiles consist of metalevel feature-value vectors which describe learning algorithms from the point of view of their representation and functionality, efficiency, robustness and practicality. Values for these features are assigned on the basis of author specifications, expert consensus or previous empirical studies. We review past evaluations of the better known learning algorithms and suggest an experimental strategy for building algorithm profiles on more quantitative grounds. Preliminary experiments have disconfirmed expert judgments on certain algorithm features, thus showing the need to build and refine such profiles via controlled experiments.},
	author = {Hilario, M. and Kalousis, A.},
	booktitle = {{IEEE} {SMC} '99 Conference Proceedings: 1999 {IEEE} International Conference on Systems, Man, and Cybernetics},
	citeulike-article-id = {402326},
	comment = {Naive Bayes grows linearly with irrelevant features},
	keywords = {feature-selection},
	pages = {956--961},
	priority = {2},
	title = {Building algorithm profiles for prior model selection in knowledge discovery systems},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=823357},
	volume = {3},
	year = {1999}
}


@article{citeulike:402319,
	abstract = {Variable and feature selection have become the focus of much research in areas of application for which datasets with tens or hundreds of thousands of variables are available. These areas include text processing of internet documents, gene expression array analysis, and combinatorial chemistry. The objective of variable selection is three-fold: improving the prediction performance of the predictors, providing faster and more cost-effective predictors, and providing a better understanding of the underlying process that generated the data. The contributions of this special issue cover a wide range of aspects of such problems: providing a better definition of the objective function, feature construction, feature ranking, multivariate feature selection, efficient search methods, and feature validity assessment methods.},
	author = {Guyon, I. and Elisseeff, A.},
	citeulike-article-id = {402319},
	doi = {10.1162/153244303322753616},
	issn = {1532-4435},
	journal = {Journal of Machine Learning Research},
	keywords = {feature-selection},
	pages = {1157--1182},
	priority = {2},
	title = {An Introduction to Variable and Feature Selection},
	url = {http://dx.doi.org/10.1162/153244303322753616},
	volume = {3},
	year = {2003}
}


@article{citeulike:402317,
	abstract = {The moment-entropy inequality shows that a continuous random variable with given second moment and maximal Shannon entropy must be Gaussian. Stam's inequality shows that a continuous random variable with given Fisher information and minimal Shannon entropy must also be Gaussian. The Cram{\'e}r--Rao inequality is a direct consequence of these two inequalities. In this paper, the inequalities above are extended to R{\'e}nyi entropy, $p$th moment, and generalized Fisher information. Generalized Gaussian random densities are introduced and shown to be the extremal densities for the new inequalities. An extension of the Cram{\'e}r--Rao inequality is derived as a consequence of these moment and Fisher information inequalities.},
	author = {Lutwak, E. and Yang, Deane and Zhang, Gaoyong},
	citeulike-article-id = {402317},
	journal = {IEEE Transactions on Information Theory},
	keywords = {maxent},
	number = {2},
	pages = {473--478},
	priority = {2},
	title = {Cram{\'e}r--{Rao} and moment-entropy inequalities for {R{\'e}nyi} entropy and generalized {Fisher} information},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1386522},
	volume = {51},
	year = {2005}
}


@article{citeulike:402254,
	abstract = {In this survey, we review work in machine learning on methods for handling data sets containing large amounts of irrelevant information. We focus on two key issues: the problem of selecting relevant features, and the problem of selecting relevant examples. We describe the advances that have been made on these topics in both empirical and theoretical work in machine learning, and we present a general framework that we use to compare different methods. We close with some challenges for future work in this area.},
	author = {Blum, A. L. and Langley, P.},
	citeulike-article-id = {402254},
	doi = {10.1016/S0004-3702(97)00063-5},
	issn = {0004-3702},
	journal = {Artificial Intelligence},
	keywords = {statistical-learning-theory},
	month = dec,
	number = {1},
	pages = {245--271},
	priority = {2},
	title = {Selection of relevant features and examples in machine learning},
	url = {http://portal.acm.org/citation.cfm?id=270626},
	volume = {97},
	year = {1997}
}


@article{citeulike:402245,
	abstract = {We present an approach to bounded constraint-relaxation for entropy maximization that corresponds to using a double-exponential prior or $\ell_1$ regularizer in likelihood maximization for log-linear models. We show that a combined incremental feature selection and regularization method can be established for maximum entropy modeling by a natural incorporation of the regularizer into gradient-based feature selection, following Perkins et al. (2003). This provides an efficient alternative to standard $\ell_1$ regularization on the full feature set, and a mathematical justification for thresholding techniques used in likelihood-based feature selection. Also, we motivate an extension to n-best feature selection for linguistic features sets with moderate redundancy, and present experimental results showing its advantage over $\ell_0$, 1-best $\ell_1$, $\ell_2$ regularization and over standard incremental feature selection for the task of maximum-entropy parsing.},
	author = {Riezler and Vasserman},
	citeulike-article-id = {402245},
	keywords = {maxent, regularization},
	priority = {2},
	title = {Incremental Feature Selection and Regularization for Relaxed Maximum-Entropy Modeling}
}


@inproceedings{citeulike:402212,
	abstract = {In supervised learning, the regularization method is often used for improving the level of generalization. We give a necessary and sufficient condition of an optimal regularization term, i.e., a regularization operator and parameter. The optimality is discussed based on the projection learning criterion in which the minimization of a generalization error is explicitly considered. We suggest how to design the optimal regularization term so as to satisfy the obtained condition.},
	author = {Nakashima, A. and Ogawa, H.},
	booktitle = {Proceedings of the 6th International Conference on Neural Information Processing ({ICONIP} '99)},
	citeulike-article-id = {402212},
	comment = {Same line of work as Sugiyama's SIC},
	keywords = {regularization},
	pages = {222--227},
	priority = {2},
	title = {How to design a regularization term for improving generalization},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=843990},
	volume = {1},
	year = {1999}
}


@article{citeulike:402208,
	abstract = {The problem of designing the regularization term and regularization parameter for linear regression models is discussed. Previously, we derived an approximation to the generalization error called the subspace information criterion (SIC), which is an unbiased estimator of the generalization error with finite samples under certain conditions. In this paper, we apply SIC to regularization learning and use it for: (a) choosing the optimal regularization term and regularization parameter from the given candidates; (b) obtaining the closed form of the optimal regularization parameter for a fixed regularization term. The effectiveness of SIC is demonstrated through computer simulations with artificial and real data.},
	author = {Sugiyama, M. and Ogawa, H.},
	citeulike-article-id = {402208},
	doi = {10.1016/S0893-6080(02)00022-9},
	issn = {0893-6080},
	journal = {Neural Networks},
	keywords = {regularization},
	month = apr,
	number = {3},
	pages = {349--361},
	priority = {2},
	title = {Optimal design of regularization term and regularization parameter by subspace information criterion},
	url = {http://dx.doi.org/10.1016/S0893-6080(02)00022-9},
	volume = {15},
	year = {2002}
}


@inproceedings{citeulike:402207,
	address = {New York, NY, USA},
	author = {Ng, Andrew Y.},
	booktitle = {ICML '04: Proceedings of the twenty-first international conference on Machine learning},
	citeulike-article-id = {402207},
	doi = {10.1145/1015330.1015435},
	isbn = {1581138285},
	keywords = {regularization},
	priority = {2},
	publisher = {ACM Press},
	title = {Feature selection, {L1} vs. {L2} regularization, and rotational invariance},
	url = {http://portal.acm.org/citation.cfm?id=1015435},
	year = {2004}
}


@misc{citeulike:402205,
	abstract = {This paper discusses the application of L1-regularized maximum entropy modeling or SL1-Max [9] to multiclass categorization problems. A new modification to the SL1-Max fast sequential learning algorithm is proposed to handle conditional distributions. Furthermore, unlike most previous studies, the present research goes beyond a single type of conditional distribution. It describes and compares a variety of modeling assumptions about the class distribution (independent or exclusive) and various types of joint or conditional distributions. It results in a new methodology for combining binary regularized classifiers to achieve multiclass categorization. In this context, Maximum Entropy can be considered as a generic and efficient regularized classification tool that matches or outperforms the state-of-the art represented by AdaBoost and SVMs.},
	archiveprefix = {arXiv},
	author = {Haffner, Patrick and Phillips, Steven and Schapire, Rob},
	citeulike-article-id = {402205},
	eprint = {cs.LG/0506101},
	keywords = {maxent},
	month = jun,
	priority = {2},
	title = {Efficient Multiclass Implementations of {L1}-Regularized Maximum Entropy},
	url = {http://arxiv.org/abs/cs.LG/0506101},
	year = {2005}
}


@unpublished{citeulike:392175,
	author = {Ruymgaart},
	title = {A short introduction to inverse statistical inference},
	keywords = {estimation, notes},
	citeulike-article-id = {392175},
	priority = {2}
}


@article{citeulike:392171,
	author = {Hoeting, Jennifer A. and Madigan, David and Raftery, Adrian E. and Volinsky, Chris T.},
	title = {Bayesian Model Averaging: A Tutorial},
	journal = {Statistical Science},
	volume = {14},
	number = {4},
	pages = {382--401},
	year = {1999},
	url = {http://links.jstor.org/sici?sici=0883-4237\%28199911\%2914\%3A4\%3C382\%3ABMAAT\%3E2.0.CO\%3B2-7},
	abstract = {Standard statistical practice ignores model uncertainty. Data analysts typically select a model from some class of models and then proceed as if the selected model had generated the data. This approach ignores the uncertainty in model selection, leading to over-confident inferences and decisions that are more risky than one thinks they are. Bayesian model averaging (BMA) provides a coherent mechanism for accounting for this model uncertainty. Several methods for implementing BMA have recently emerged. We discuss these methods and present a number of examples. In these examples, BMA provides improved out-of-sample predictive performance. We also provide a catalogue of currently available BMA software.},
	keywords = {bayesian, notes},
	citeulike-article-id = {392171},
	priority = {2}
}


@article{citeulike:392165,
	abstract = {Least squares support vector machines (LS-SVM) is an SVM version which involves equality instead of inequality constraints and works with a least squares cost function. In this way, the solution follows from a linear Karush-Kuhn-Tucker system instead of a quadratic programming problem. However, sparseness is lost in the LS-SVM case and the estimation of the support values is only optimal in the case of a Gaussian distribution of the error variables. In this paper, we discuss a method which can overcome these two drawbacks. We show how to obtain robust estimates for regression by applying a weighted version of LS-SVM. We also discuss a sparse approximation procedure for weighted and unweighted LS-SVM. It is basically a pruning method which is able to do pruning based upon the physical meaning of the sorted support values, while pruning procedures for classical multilayer perceptrons require the computation of a Hessian matrix or its inverse. The methods of this paper are illustrated for RBF kernels and demonstrate how to obtain robust estimates with selection of an appropriate number of hidden units, in the case of outliers or non-Gaussian error distributions with heavy tails.},
	author = {Suykens, J. A. K. and De Brabanter, J. and Lukas, L. and Vandewalle, J.},
	citeulike-article-id = {392165},
	doi = {10.1016/S0925-2312(01)00644-0},
	issn = {0925-2312},
	journal = {Neurocomputing},
	keywords = {svm},
	month = oct,
	number = {1},
	pages = {85--105},
	priority = {2},
	title = {Weighted least squares support vector machines: robustness and sparse approximation},
	url = {http://dx.doi.org/10.1016/S0925-2312(01)00644-0},
	volume = {48},
	year = {2002}
}


@article{citeulike:392022,
	abstract = {I propose a general framework for approximating Bayesian belief networks through model simplification by arc removal. Given an upper bound on the absolute error allowed on the prior and posterior probability distributions of the approximated network, a subset of arcs is removed, thereby speeding up probabilistic inference.},
	author = {van Engelen, R. A.},
	citeulike-article-id = {392022},
	journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
	keywords = {approximation},
	number = {8},
	pages = {916--920},
	priority = {2},
	title = {Approximating {Bayesian} belief networks by arc removal},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=608295},
	volume = {19},
	year = {1997}
}


@inproceedings{citeulike:392020,
	author = {Choi},
	citeulike-article-id = {392020},
	keywords = {approximation},
	priority = {2},
	title = {On {Bayesian} Network Approximation by Edge Deletion},
	year = {2005}
}


@incollection{MR1421373,
	author = {Wo{\'z}niakowski, Henryk},
	title = {Overview of information-based complexity},
	booktitle = {The mathematics of numerical analysis (Park City, UT, 1995)},
	series = {Lectures in Appl. Math.},
	volume = {32},
	pages = {915--927},
	publisher = {Amer. Math. Soc.},
	address = {Providence, RI},
	year = {1996},
	keywords = {computational-complexity},
	citeulike-article-id = {392017},
	priority = {2}
}


@misc{citeulike:391264,
	abstract = {In this expository paper we illustrate the generality of game theoretic probability protocols of Shafer and Vovk (2001) in finite-horizon discrete games. By restricting ourselves to finite-horizon discrete games, we can explicitly describe how discrete distributions with finite support and the discrete pricing formulas, such as the Cox-Ross-Rubinstein formula, are naturally derived from game-theoretic probability protocols. Corresponding to any discrete distribution with finite support, we construct a finite-horizon discrete game, a replicating strategy of Skeptic, and a neutral forecasting strategy of Forecaster, such that the discrete distribution is derived from the game. Construction of a replicating strategy is the same as in the standard arbitrage arguments of pricing European options in the binomial tree models. However the game theoretic framework is advantageous because no a priori probabilistic assumption is needed.},
	archiveprefix = {arXiv},
	author = {Takemura, Akimichi and Suzuki, Taiji},
	citeulike-article-id = {391264},
	eprint = {math.PR/0509367},
	keywords = {game-theory},
	month = sep,
	priority = {2},
	title = {Game theoretic derivation of discrete distributions and discrete pricing formulas},
	url = {http://arxiv.org/abs/math.PR/0509367},
	year = {2005}
}


@article{citeulike:390815,
	address = {New York, NY, USA},
	author = {Hut, Piet and Ruelle, David and Traub, Joseph},
	citeulike-article-id = {390815},
	internal-note = {DOI field removed: the exported value 10.1002/(SICI)1099-0526(199807 was truncated mid-identifier and does not resolve; restore the full DOI once verified against the publisher record},
	issn = {1076-2787},
	journal = {Complexity},
	keywords = {computational-complexity},
	month = jul,
	number = {6},
	pages = {33--38},
	priority = {2},
	publisher = {John Wiley \& Sons, Inc.},
	title = {Varieties of limits to scientific knowledge},
	url = {http://portal.acm.org/citation.cfm?id=295859.295864},
	volume = {3},
	year = {1998}
}


@book{citeulike:390813,
	author = {Traub, Joe F.},
	citeulike-article-id = {390813},
	comment = {Develops theory of computational complexity for numerical (approximate) algorithms},
	howpublished = {Hardcover},
	isbn = {0201078902},
	keywords = {computational-complexity},
	priority = {2},
	publisher = {Addison-Wesley},
	title = {Information, Uncertainty, Complexity},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0201078902},
	year = {1983}
}


@inproceedings{citeulike:389128,
	abstract = {The paper provides a new viewpoint on regularization theory from different perspectives. It is shown that the regularized solution can be derived from the Fourier transformation operator in the transformation domain and with equivalent form from the linear differential operator in the spatial domain. The state-of-the-art research in regularization is briefly reviewed with extended discussions on Occam's razor, minimum length description, Bayesian framework, pruning algorithms, statistical learning theory, and equivalent regularization},
	author = {Chen, Zhe and Haykin, S.},
	booktitle = {2001 {IEEE} International Conference on Systems, Man, and Cybernetics},
	citeulike-article-id = {389128},
	keywords = {regularization},
	pages = {1642--1647},
	priority = {2},
	title = {A new view on regularization theory},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=973520},
	volume = {3},
	year = {2001}
}


@article{citeulike:389122,
	abstract = {Generalization bounds depending on the margin of a classifier are a relatively new development. They provide an explanation of the performance of state-of-the-art learning systems such as support vector machines (SVMs) and Adaboost. The difficulty with these bounds has been either their lack of robustness or their looseness. The question of whether the generalization of a classifier can be more tightly bounded in terms of a robust measure of the distribution of margin values has remained open for some time. The paper answers this open question in the affirmative and, furthermore, the analysis leads to bounds that motivate the previously heuristic soft margin SVM algorithms as well as justifying the use of the quadratic loss in neural network training algorithms. The results are extended to give bounds for the probability of failing to achieve a target accuracy in regression prediction, with a statistical analysis of ridge regression and Gaussian processes as a special case. The analysis presented in the paper has also led to new boosting algorithms described elsewhere.},
	author = {Shawe-Taylor, J. and Cristianini, N.},
	citeulike-article-id = {389122},
	journal = {IEEE Transactions on Information Theory},
	keywords = {regularization},
	number = {10},
	pages = {2721--2735},
	priority = {2},
	title = {On the generalization of soft margin algorithms},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1035123},
	volume = {48},
	year = {2002}
}


@article{Kearns1997,
	author = {Kearns, Michael and Mansour, Yishay and Ng, Andrew Y. and Ron, Dana},
	citeulike-article-id = {387743},
	doi = {10.1023/A:1007344726582},
	journal = {Machine Learning},
	keywords = {regularization},
	month = {April},
	pages = {7--50},
	priority = {2},
	title = {An Experimental and Theoretical Comparison of Model Selection Methods},
	url = {http://www.springerlink.com/openurl.asp?genre=article\&id=doi:10.1023/A:1007344726582},
	volume = {27},
	year = {1997}
}


@article{citeulike:387675,
	abstract = {Probability models are estimated by use of penalized log-likelihood criteria related to Akaike (1973) information criterion (AIC) and minimum description length (MDL). The accuracies of the density estimators are shown to be related to the tradeoff between three terms: the accuracy of approximation, the model dimension, and the descriptive complexity of the model classes. The asymptotic risk is determined under conditions on the penalty term, and is shown to be minimax optimal for some cases. As an application, we show that the optimal rate of convergence is simultaneously achieved for log-densities in Sobolev spaces $W_2^s(U)$ without knowing the smoothness parameter $s$ and norm parameter $U$ in advance. Applications to neural network models and sparse density function estimation are also provided.},
	author = {Yang, Yuhong and Barron, A. R.},
	citeulike-article-id = {387675},
	journal = {IEEE Transactions on Information Theory},
	keywords = {regularization},
	number = {1},
	pages = {95--116},
	priority = {2},
	title = {An asymptotic property of model selection criteria},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=650993},
	volume = {44},
	year = {1998}
}


@article{citeulike:387674,
	abstract = {Extensive, compares risk bounds to AIC, Mallows C, Vapnik's VC dimension, and other approaches},
	author = {Barron, Andrew and Birg{\'e}, Lucien and Massart, Pascal},
	citeulike-article-id = {387674},
	journal = {Probability Theory and Related Fields},
	keywords = {regularization},
	number = {3},
	pages = {301--413},
	priority = {2},
	title = {Risk bounds for model selection via penalization},
	volume = {113},
	year = {1999}
}


@article{MR2089748,
	author = {Vorontsov, K. V.},
	citeulike-article-id = {387504},
	journal = {Doklady Akademii Nauk},
	keywords = {statistical-learning-theory},
	number = {2},
	pages = {175--178},
	priority = {2},
	title = {Combinatorial bounds for the quality of learning by precedents},
	volume = {394},
	year = {2004}
}


@incollection{MR1638361,
	address = {Berlin},
	author = {Haussler, David and Opper, Manfred},
	booktitle = {Structures in logic and computer science},
	citeulike-article-id = {385928},
	keywords = {statistical-learning-theory},
	pages = {212--235},
	priority = {2},
	publisher = {Springer},
	series = {Lecture Notes in Computer Science},
	title = {Metric entropy and minimax risk in classification},
	volume = {1261},
	year = {1997}
}


@article{MR1968413,
	author = {Poggio, Tomaso and Smale, Steve},
	citeulike-article-id = {384392},
	journal = {Notices of the American Mathematical Society},
	keywords = {statistical-learning-theory},
	number = {5},
	pages = {537--544},
	priority = {2},
	title = {The mathematics of learning: dealing with data},
	volume = {50},
	year = {2003}
}


@article{citeulike:384355,
	author = {Oller, J. M. and Corcuera, J. M.},
	title = {Intrinsic Analysis of Statistical Estimation},
	journal = {The Annals of Statistics},
	volume = 23,
	number = 5,
	pages = {1562--1581},
	year = 1995,
	url = {http://links.jstor.org/sici?sici=0090-5364\%28199510\%2923\%3A5\%3C1562\%3AIAOSE\%3E2.0.CO\%3B2-A},
	abstract = {The parametric statistical models with suitable regularity conditions have a natural Riemannian manifold structure, given by the information metric. Since the parameters are merely labels for the probability measures, an inferential statement should be formulated through intrinsic objects, invariant under reparametrizations. In this context the estimators will be random objects valued on the manifold corresponding to the statistical model. In spite of these considerations, classical measures of an estimator's performance, like the bias and the mean square error, are clearly dependent on the statistical model parametrizations. In this paper the authors work with extended notions of mean value and moments of random objects which take values on a Hausdorff and connected manifold, equipped with an affine connection. In particular, the Riemannian manifold case is considered. This extension is applied to the bias and the mean square error study in statistical point estimation theory. Under this approach an intrinsic version of the Cramer-Rao lower bound is obtained: a lower bound, which depends on the intrinsic bias and the curvature of the statistical model, for the mean square of the Rao distance, the invariant measure analogous to the mean square error. Further, the behavior of the mean square of the Rao distance of an estimator when conditioning with respect to a sufficient statistic is considered, obtaining intrinsic versions of the Rao-Blackwell and Lehmann-Scheffe theorems. Asymptotic properties complete the study.},
	comment = {Coordinate-free description of bias/variance of estimators},
	keywords = {information-geometry},
	citeulike-article-id = {384355},
	priority = 2
}


@article{citeulike:384353,
	abstract = {We determine Riemannian distances between a large class of multivariate probability densities with the same mean, where the Riemannian metric is induced by a weighted Fisher information matrix. We reduce the evaluation of distances to quadrature and in some cases give closed form expressions.},
	author = {Micchelli, Charles A. and Noakes, Lyle},
	citeulike-article-id = {384353},
	doi = {10.1016/S0047-259X(03)00132-5},
	journal = {Journal of Multivariate Analysis},
	keywords = {information-geometry},
	month = {January},
	number = {1},
	pages = {97--115},
	priority = {2},
	title = {{Rao} distances},
	url = {http://dx.doi.org/10.1016/S0047-259X(03)00132-5},
	volume = {92},
	year = {2005}
}


@inproceedings{citeulike:383188,
	abstract = {We describe two multivariate statistical dependence measures which can be orthogonally decomposed to separate the effects of pairwise, triplewise, and higher order interactions between the random variables. These decompositions provide a convenient method of analyzing statistical dependencies between large groups of random variables, within which smaller "sub-groups" may exhibit dependencies separately from the rest of the variables. The first dependence measure is a generalization of Pearson's $\phi^2$, and we decompose it using an orthonormal series expansion of joint probability density functions. The second measure is based on the Kullback-Leibler distance, and we decompose it using information geometry. Applications of these techniques include analysis of neural population recordings and multimodal sensor fusion. We discuss in detail the simple example of three jointly defined binary random variables.},
	author = {Goodman, I. N. and Johnson, D. H.},
	booktitle = {2004 {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP} '04)},
	citeulike-article-id = {383188},
	comment = {Uses Amari's technique to decompose KL divergence into orders. Also decomposes Pearson's phi, which gives similar results, but is easier to compute},
	keywords = {information-geometry},
	pages = {ii-1017--ii-1020},
	priority = {2},
	title = {Orthogonal decompositions of multivariate statistical dependence measures},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1326433},
	volume = {2},
	year = {2004}
}


@misc{citeulike:383186,
	abstract = {A broad set of sufficient conditions that guarantees the existence of the
maximum entropy (maxent) distribution consistent with specified bounds on
certain generalized moments is derived. Most results in the literature are
either focused on the minimum cross-entropy distribution or apply only to
distributions with a bounded-volume support or address only equality
constraints. The results of this work hold for general moment inequality
constraints for probability distributions with possibly unbounded support, and
the technical conditions are explicitly on the underlying generalized moment
functions. An analytical characterization of the maxent distribution is also
derived using results from the theory of constrained optimization in
infinite-dimensional normed linear spaces. Several auxiliary results of
independent interest pertaining to certain properties of convex coercive
functions are also presented.},
	archiveprefix = {arXiv},
	author = {Ishwar, Prakash and Moulin, Pierre},
	citeulike-article-id = {383186},
	eprint = {cs.IT/0506013},
	keywords = {information-geometry, maxent},
	month = {June},
	priority = {2},
	title = {On the existence and characterization of the maxent distribution under general moment inequality constraints},
	url = {http://arxiv.org/abs/cs.IT/0506013},
	year = {2005}
}


@misc{citeulike:383185,
	abstract = {Stochastic interdependence of a probability distribution on a product
space is measured by its Kullback-Leibler distance from the exponential family
of product distributions (called multi-information). Here we investigate
low-dimensional exponential families that contain the maximizers of stochastic
interdependence in their closure.
Based on a detailed description of the structure of probability distributions
with globally maximal multi-information we obtain our main result: The
exponential family of pure pair-interactions contains all global maximizers of
the multi-information in its closure.},
	author = {Ay, Nihat and Knauf, Andreas},
	citeulike-article-id = {383185},
	comment = {Builds up on the "pragmatic structuring" paper},
	keywords = {information-geometry},
	priority = {2},
	title = {Maximizing Multi-Information},
	year = {2003}
}


@article{citeulike:382369,
	author = {Jedynak, Bruno M. and Khudanpur, Sanjeev},
	citeulike-article-id = {382369},
	journal = {Neural Computation},
	keywords = {maxent},
	number = {7},
	priority = {2},
	title = {Maximum Likelihood Set for Estimating a Probability Mass Function},
	url = {http://neco.mitpress.org/cgi/content/abstract/17/7/1508},
	volume = {17},
	year = {2005}
}


@article{MR1309433,
	author = {Sethuraman, Jayaram},
	citeulike-article-id = {378732},
	journal = {Statistica Sinica},
	keywords = {process},
	number = {2},
	pages = {639--650},
	priority = {2},
	title = {A constructive definition of {D}irichlet priors},
	volume = {4},
	year = {1994}
}


@article{citeulike:378731,
	abstract = {In this paper we briefly study the basic idea of Akaike's (1973) information criterion (AIC). Then, we present some recent developments on a new entropic or information complexity (ICOMP) criterion of Bozdogan (1988a, 1988b, 1990, 1994d, 1996, 1998a, 1998b) for model selection. A rationale for ICOMP as a model selection criterion is that it combines a badness-of-fit term (such as minus twice the maximum log likelihood) with a measure of complexity of a model differently than AIC, or its variants, by taking into account the interdependencies of the parameter estimates as well as the dependencies of the model residuals. We operationalize the general form of ICOMP based on the quantification of the concept of overall model complexity in terms of the estimated inverse-Fisher information matrix. This approach results in an approximation to the sum of two Kullback--Leibler distances. Using the correlational form of the complexity, we further provide yet another form of ICOMP to take into account the interdependencies (i.e., correlations) among the parameter estimates of the model. Later, we illustrate the practical utility and the importance of this new model selection criterion by providing several real as well as Monte Carlo simulation examples and compare its performance against AIC, or its variants. Copyright 2000 Academic Press.},
	author = {Bozdogan, H.},
	citeulike-article-id = {378731},
	doi = {10.1006/jmps.1999.1277},
	issn = {0022-2496},
	journal = {Journal of Mathematical Psychology},
	keywords = {regularization},
	month = {March},
	number = {1},
	pages = {62--91},
	priority = {2},
	title = {{Akaike's} Information Criterion and Recent Developments in Information Complexity},
	url = {http://www.ingentaconnect.com/content/ap/mp/2000/00000044/00000001/art01277},
	volume = {44},
	year = {2000}
}


@incollection{NIPS2005_762,
	address = {Cambridge, MA},
	author = {Teh, Yee W. and Jordan, Michael I. and Beal, Matthew J. and Blei, David M.},
	booktitle = {Advances in Neural Information Processing Systems 17},
	citeulike-article-id = {378729},
	editor = {Saul, Lawrence K. and Weiss, Yair and Bottou, L{\'e}on},
	keywords = {bayesian, mlrg, process},
	pages = {1385--1392},
	priority = {2},
	publisher = {MIT Press},
	title = {Sharing Clusters among Related Groups: Hierarchical {Dirichlet} Processes},
	year = {2005}
}


@article{citeulike:376152,
	abstract = {In this paper we briefly study the basic idea of Akaike's (1973) information criterion (AIC). Then, we present some recent developments on a new entropic or information complexity (ICOMP) criterion of Bozdogan (1988a, 1988b, 1990, 1994d, 1996, 1998a, 1998b) for model selection. A rationale for ICOMP as a model selection criterion is that it combines a badness-of-fit term (such as minus twice the maximum log likelihood) with a measure of complexity of a model differently than AIC, or its variants, by taking into account the interdependencies of the parameter estimates as well as the dependencies of the model residuals. We operationalize the general form of ICOMP based on the quantification of the concept of overall model complexity in terms of the estimated inverse-Fisher information matrix. This approach results in an approximation to the sum of two Kullback-Leibler distances. Using the correlational form of the complexity, we further provide yet another form of ICOMP to take into account the interdependencies (i.e., correlations) among the parameter estimates of the model. Later, we illustrate the practical utility and the importance of this new model selection criterion by providing several real as well as Monte Carlo simulation examples and compare its performance against AIC, or its variants. Copyright 2000 Academic Press.},
	address = {The University of Tennessee},
	author = {Bozdogan, H.},
	citeulike-article-id = {376152},
	doi = {10.1006/jmps.1999.1277},
	issn = {0022-2496},
	journal = {Journal of Mathematical Psychology},
	keywords = {ill, regularization},
	month = {March},
	number = {1},
	pages = {62--91},
	priority = {2},
	title = {{Akaike's} Information Criterion and Recent Developments in Information Complexity},
	url = {http://dx.doi.org/10.1006/jmps.1999.1277},
	volume = {44},
	year = {2000}
}


@incollection{citeulike:374587,
	abstract = {In this introductory chapter we seek to cover sufficient differential geometry in order to understand its application to econometrics. It is not intended to be a comprehensive review either of differential geometric theory, or of all the applications that geometry has found in statistics. Rather it is aimed as a rapid tutorial covering the material needed in the rest of this volume and the general literature. The full abstract power of a modern geometric treatment is not always necessary and such a development can often hide in its abstract constructions as much as it illuminates. In section 2 we show how econometric models can take the form of geometrical objects known as manifolds, in particular concentrating on classes of models that are full or curved exponential families. This development of the underlying mathematical structure leads into section 3, where the tangent space is introduced. It is very helpful to be able to view the tangent space in a number of different but mathematically equivalent ways, and we exploit this throughout the chapter. Section 4 introduces the idea of a metric and more general tensors illustrated with statistically based examples. Section 5 considers the most important tool that a differential geometric approach offers: the affine connection. We look at applications of this idea to asymptotic analysis, the relationship between geometry and information theory and the problem of the choice of parameterisation. Section 6 introduces key mathematical theorems involving statistical manifolds, duality, projection and finally the statistical application of the classic geometric theorem of Pythagoras. The last two sections look at direct applications of this geometric framework, in particular at the problem of inference in curved families and at the issue of information loss and recovery.
Note that, although this chapter aims to give a reasonably precise mathematical development of the required theory, an alternative and perhaps more intuitive approach can be found in the chapter by Critchley, Marriott and Salmon in this volume. For a more exhaustive and detailed review of current geometrical statistical theory see Kass and Vos (1997) or, from a more purely mathematical background, see Murray and Rice (1993).},
	address = {Cambridge},
	author = {Marriott, P. and Salmon, M.},
	booktitle = {Applications of differential geometry to econometrics},
	citeulike-article-id = {374587},
	editor = {Marriott, Paul and Salmon, Mark},
	keywords = {ill, information-geometry},
	priority = {2},
	publisher = {Cambridge Univ. Press},
	title = {An Introduction to differential geometry},
	year = {2000}
}


@incollection{citeulike:374586,
	abstract = {Differential geometry has found fruitful application in statistical inference. In particular, Amari's (1990) expected geometry is used in higher-order asymptotic analysis and in the study of sufficiency and ancillarity. However, we can see three drawbacks to the use of a differential geometric approach in econometrics and statistics more generally. First, the mathematics is unfamiliar and the terms involved can be difficult for the econometrician to appreciate fully. Secondly, their statistical meaning can be less than completely clear. Finally, the fact that, at its core, geometry is a visual subject can be obscured by the mathematical formalism required for a rigorous analysis, thereby hindering intuition. All three drawbacks apply particularly to the differential geometric concept of a non-metric affine connection. The primary objective of this chapter is to attempt to mitigate these drawbacks in the case of Amari's expected geometric structure on a full exponential family. We aim to do this by providing an elementary account of this structure that is clearly based statistically, accessible geometrically and visually presented. Statistically, we use three natural tools: the score function and its first two moments with respect to the true distribution. Geometrically, we are largely able to restrict attention to tensors; in particular, we are able to avoid the need formally to define an affine connection. To emphasise the visual foundation of geometric analysis we parallel the mathematical development with graphical illustrations using important examples of full exponential families. Although the analysis is not restricted to this case, we emphasise one-dimensional examples so that simple pictures can be used to illustrate the underlying geometrical ideas and aid intuition.
It turns out that this account also sheds some new light on the choice of parameterisation as discussed by Amari (1990), extending earlier work by Bates and Watts (1980, 1981), Hougaard (1982) and Kass (1984). There are also a number of points of contact between our presentation and Firth (1993). A key feature of our account is that all expectations and induced distributions are taken with respect to one fixed distribution, namely, that assumed to give rise to the data. This is the so-called preferred point geometrical approach developed in Critchley, Marriott and Salmon (1993, 1994), on whose results we draw where appropriate. Our hope is that the following development will serve to broaden interest in an important and developing area. For a more formal but still readable treatment of differential geometry, see Dodson and Poston (1977). For broader accounts of the application of differential geometry to statistics, see the review chapters or monographs by Barndorff-Nielsen, Cox and Reid (1986), Kass (1987, 1989), Amari (1990) and Murray and Rice (1993). The chapter is organised as follows. The elementary prerequisites are established in section 1. The key elements of Amari's expected geometry of general families of distributions are briefly and intuitively reviewed in section 2. In particular, his $\alpha$-connections are discussed in terms of the characteristic statistical properties of their associated affine parameterisations. Section 3 contains our account of this geometry in the full exponential family case, as outlined above, and section 4 considers the effect of changing the sample size. Preliminaries: the general framework. Let $M = \{p(x, \theta) : \theta \in \Theta\}$ be a $p$-dimensional parametric family of probability (density) functions. The available data $x = (x_1, \ldots, x_n)^T$ is modelled as a random sample from some unknown true distribution $p(x, \phi) \in M$.
Let the parameter space $\Theta$ be an open connected subset of $\mathbf{R}^p$. The family is regarded as a manifold, with the parameter playing the role of a coordinate system on it. Formally, certain regularity conditions are entailed. These are detailed in Amari (1990, p. 16).},
	address = {Cambridge},
	author = {Critchley, F. and Marriott, P. and Salmon, M.},
	booktitle = {Applications of differential geometry to econometrics},
	citeulike-article-id = {374586},
	editor = {Marriott, Paul and Salmon, Mark},
	keywords = {ill, information-geometry},
	priority = {2},
	publisher = {Cambridge Univ. Press},
	title = {An Elementary Treatment of {Amari's} expected geometry},
	year = {2000}
}


@book{citeulike:372874,
	author = {Lanczos, Cornelius},
	citeulike-article-id = {372874},
	howpublished = {{Unknown Binding}},
	isbn = {0124358500},
	month = {May},
	priority = {2},
	publisher = {Academic Press},
	title = {Space through the ages: The evolution of geometrical ideas from {Pythagoras} to {Hilbert} and {Einstein}},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0124358500},
	year = {1970}
}


@article{citeulike:167555,
	address = {Cambridge, MA, USA},
	author = {Guyon, Isabelle and Elisseeff, Andr{\'e}},
	citeulike-article-id = {167555},
	issn = {1533-7928},
	journal = {Journal of Machine Learning Research},
	keywords = {feature-selection},
	pages = {1157--1182},
	priority = {2},
	publisher = {MIT Press},
	title = {An introduction to variable and feature selection},
	url = {http://portal.acm.org/citation.cfm?id=944968},
	volume = {3},
	year = {2003}
}


@article{citeulike:370427,
	abstract = {The game which can be taken to lie behind the maximum-entropy principle is studied. Refining previous techniques, new theoretical results are obtained. These results are illustrated by concrete examples pertaining to well-known classical models.},
	author = {Tops{\o}e, F.},
	citeulike-article-id = {370427},
	journal = {IEEE Transactions on Information Theory},
	keywords = {game-theory, maxent},
	number = {8},
	pages = {2368--2376},
	priority = {2},
	title = {Maximum entropy versus minimum risk and applications to some classical discrete distributions},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1019845},
	volume = {48},
	year = {2002}
}


@inproceedings{DBLP:conf/flairs/Zhang04,
	author = {Zhang, Harry},
	booktitle = {FLAIRS Conference},
	citeulike-article-id = {370404},
	editor = {Barr, Valerie and Markov, Zdravko},
	keywords = {naivebayes},
	priority = {2},
	publisher = {AAAI Press},
	title = {The Optimality of Naive {Bayes}},
	year = {2004}
}


@inproceedings{citeulike:369895,
	abstract = {Probabilistic classifiers are developed by assuming generative models
which are product distributions over the original attribute space (as in naive
Bayes) or more involved spaces (as in general Bayesian networks). While this
paradigm has been shown experimentally successful on real world applications,
despite vastly simplified probabilistic assumptions, the question of why these
approaches work is still open.
This paper resolves this question. We show that almost all joint distributions with
a given set of marginals (i.e., all distributions that could have given rise to the
classifier learned) or, equivalently, almost all data sets that yield this set of marginals,
are very close (in terms of distributional distance) to the product distribution on
the marginals; the number of these distributions goes down exponentially with
their distance from the product distribution. Consequently, as we show, for almost
all joint distributions with this set of marginals, the penalty incurred in using the
marginal distribution rather than the true one is small. In addition to resolving the
puzzle surrounding the success of probabilistic classifiers our results contribute
to understanding the tradeoffs in developing probabilistic classifiers and will help
in developing better classifiers.},
	address = {London, UK},
	author = {Garg, Ashutosh and Roth, Dan},
	booktitle = {ECML '01: Proceedings of the 12th European Conference on Machine Learning},
	citeulike-article-id = {369895},
	comment = {gives bounds on 0-1 error in terms of entropy/variational distance},
	isbn = {3540425365},
	keywords = {maxent, naivebayes},
	pages = {179--191},
	priority = {2},
	publisher = {Springer-Verlag},
	title = {Understanding Probabilistic Classifiers},
	url = {http://portal.acm.org/citation.cfm?id=650022},
	year = {2001}
}


@article{citeulike:369866,
	abstract = {The classification problem is considered in which an output variable $y$ assumes discrete values with respective probabilities that depend upon the simultaneous values of a set of input variables $x = \{x_1, \ldots, x_n\}$. At issue is how error in the estimates of these probabilities affects classification error when the estimates are used in a classification rule. These effects are seen to be somewhat counter intuitive in both their strength and nature. In particular the bias and variance components of the estimation error combine to influence classification in a very different way than with squared error on the probabilities themselves. Certain types of (very high) bias can be canceled by low variance to produce accurate classification. This can dramatically mitigate the effect of the bias associated with some simple estimators like ``naive'' Bayes, and the bias induced by the curse-of-dimensionality on nearest-neighbor procedures. This helps explain why such simple methods are often competitive with and sometimes superior to more sophisticated ones for classification, and why ``bagging/aggregating'' classifiers can often improve accuracy. These results also suggest simple modifications to these procedures that can (sometimes dramatically) further improve their classification performance.},
	author = {Friedman, J. H.},
	citeulike-article-id = {369866},
	issn = {1384-5810},
	journal = {Data Mining and Knowledge Discovery},
	keywords = {naivebayes},
	number = {1},
	pages = {55--77},
	priority = {2},
	title = {On Bias, Variance, 0/1-Loss, and the Curse-of-Dimensionality},
	url = {http://www.ingentaconnect.com/content/klu/dami/1997/00000001/00000001/00127246},
	volume = {1},
	year = {1997}
}


@techreport{citeulike:369865,
	author = {Spiegelhalter, David J. and Best, Nicola G. and Carlin, Bradley P.},
	citeulike-article-id = {369865},
	institution = {Division of Biostatistics, University of Minnesota},
	keywords = {regularization},
	number = {98-009},
	priority = {2},
	title = {{Bayesian} deviance, the effective number of parameters, and the comparison of arbitrarily complex models},
	year = {1998}
}


@article{MR882765,
	author = {Hastie, Trevor},
	citeulike-article-id = {369864},
	journal = {The American Statistician},
	keywords = {toread},
	number = {1},
	pages = {16--20},
	priority = {2},
	title = {A closer look at the deviance},
	volume = {41},
	year = {1987}
}


@article{citeulike:369834,
	abstract = {The problem of model selection is considerably important for acquiring higher levels of generalization capability in supervised learning. In this article, we propose a new criterion for model selection, the subspace information criterion (SIC), which is a generalization of Mallows's C(L). It is assumed that the learning target function belongs to a specified functional Hilbert space and the generalization error is defined as the Hilbert space squared norm of the difference between the learning result function and target function. SIC gives an unbiased estimate of the generalization error so defined. SIC assumes the availability of an unbiased estimate of the target function and the noise covariance matrix, which are generally unknown. A practical calculation method of SIC for least-mean-squares learning is provided under the assumption that the dimension of the Hilbert space is less than the number of training examples. Finally, computer simulations in two examples show that SIC works well even when the number of training examples is small.},
	address = {Department of Computer Science, Graduate School of Information Science and Engineering, Tokyo Institute of Technology, Meguro-ku, Tokyo, 152-8552, Japan.},
	author = {Sugiyama, M. and Ogawa, H.},
	citeulike-article-id = {369834},
	comment = {Estimates true function and variance from data, and hence generalization error},
	doi = {10.1162/08997660152469387},
	issn = {0899-7667},
	journal = {Neural Computation},
	keywords = {regularization},
	month = {August},
	number = {8},
	pages = {1863--1889},
	priority = {2},
	title = {Subspace information criterion for model selection},
	url = {http://dx.doi.org/10.1162/08997660152469387},
	volume = {13},
	year = {2001}
}


@incollection{MR1789298,
	author = {Davidson, Russell},
	title = {Efficiency and robustness in a geometrical perspective},
	booktitle = {Applications of differential geometry to econometrics},
	publisher = {Cambridge Univ. Press},
	address = {Cambridge},
	pages = {151--183},
	year = 2000,
	keywords = {information-geometry},
	citeulike-article-id = {369831},
	priority = 2
}


@inproceedings{citeulike:369781,
	address = {London, UK},
	author = {Amari, Shun-Ichi and Murata, Noboru},
	booktitle = {IWANN '97: Proceedings of the International Work-Conference on Artificial and Natural Neural Networks},
	citeulike-article-id = {369781},
	isbn = {3540630473},
	keywords = {regularization},
	pages = {284--293},
	priority = {2},
	publisher = {Springer-Verlag},
	title = {Statistical Analysis of Regularization Constant --- From {Bayes}, {MDL} and {NIC} Points of View},
	url = {http://portal.acm.org/citation.cfm?id=646367.690131},
	year = {1997}
}


@article{citeulike:368263,
	author = {Van Hulle, Marc M.},
	citeulike-article-id = {368263},
	journal = {Neural Computation},
	keywords = {approximation},
	number = {9},
	pages = {1903--1910},
	priority = {2},
	title = {{Edgeworth} Approximation of Multivariate Differential Entropy},
	url = {http://neco.mitpress.org/cgi/content/abstract/17/9/1903},
	volume = {17},
	year = {2005}
}


@article{citeulike:368258,
	abstract = {Nonquadratic regularizers, in particular the $\ell_1$ norm regularizer can yield sparse solutions that generalize well. In this work we propose the generalized subspace information criterion (GSIC) that allows to predict the generalization error for this useful family of regularizers. We show that under some technical assumptions GSIC is an asymptotically unbiased estimator of the generalization error. GSIC is demonstrated to have a good performance in experiments with the $\ell_1$ norm regularizer as we compare with the network information criterion (NIC) and cross-validation in relatively large sample cases. However in the small sample case, GSIC tends to fail to capture the optimal model due to its large variance. Therefore, also a biased version of GSIC is introduced, which achieves reliable model selection in the relevant and challenging scenario of high-dimensional data and few samples.},
	author = {Tsuda, K. and Sugiyama, M. and M{\"u}ller, K.-R.},
	citeulike-article-id = {368258},
	journal = {IEEE Transactions on Neural Networks},
	keywords = {regularization},
	number = {1},
	pages = {70--80},
	priority = {2},
	title = {Subspace information criterion for nonquadratic regularizers---Model selection for sparse regressors},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=977272},
	volume = {13},
	year = {2002}
}


@article{citeulike:367050,
	abstract = {We introduce an entropic prior for multinomial parameter estimation problems and solve for its maximum a posteriori (MAP) estimator. The prior is a bias for maximally structured and minimally ambiguous models. In conditional probability models with hidden state, iterative MAP estimation drives weakly supported parameters toward extinction, effectively turning them off. Thus, structure discovery is folded into parameter estimation. We then establish criteria for simplifying a probabilistic model's graphical structure by trimming parameters and states, with a guarantee that any such deletion will increase the posterior probability of the model. Trimming accelerates learning by sparsifying the model. All operations monotonically and maximally increase the posterior probability, yielding structure-learning algorithms only slightly slower than parameter estimation via expectation-maximization and orders of magnitude faster than search-based structure induction. When applied to hidden Markov model training, the resulting models show superior generalization to held-out test data. In many cases the resulting models are so sparse and concise that they are interpretable, with hidden states that strongly correlate with meaningful categories.},
	author = {Brand, Matthew},
	citeulike-article-id = {367050},
	journal = {Neural Computation},
	keywords = {maxent},
	number = {5},
	pages = {1155--1182},
	priority = {2},
	title = {Structure Learning in Conditional Probability Models via an Entropic Prior and Parameter Extinction},
	url = {http://neco.mitpress.org/cgi/content/abstract/11/5/1155},
	volume = {11},
	year = {1999}
}


@techreport{RePEc:fth:socaec:m8803,
	author = {Zellner, A.},
	citeulike-article-id = {366084},
	comment = {http://hunch.net/?p=65\#comments},
	institution = {University of Southern California, Department of Economics},
	keywords = {bayesian, information-theory, philosophy, toread},
	number = {m8803},
	priority = {2},
	title = {Optimal Information-Processing and {Bayes'} Theorem},
	type = {Papers},
	year = {1988}
}


@inproceedings{nips02-AA28,
	address = {Cambridge, MA},
	author = {Ng, A. Y. and Jordan, M. I.},
	booktitle = {Advances in Neural Information Processing Systems 14},
	citeulike-article-id = {365760},
	editor = {Dietterich, T. G. and Becker, S. and Ghahramani, Z.},
	keywords = {generative-discriminative},
	pages = {841--848},
	priority = {2},
	publisher = {MIT Press},
	title = {On Discriminative vs. Generative Classifiers: {A} Comparison of Logistic Regression and Naive {Bayes}},
	year = {2002}
}


@inproceedings{citeulike:365120,
	author = {Zhang, Dell and Chen, Xi and Lee, Wee S.},
	title = {Text classification with kernels on the multinomial manifold},
	booktitle = {SIGIR '05: Proceedings of the 28th annual international ACM SIGIR conference on Research and development in information retrieval},
	pages = {266--273},
	year = {2005},
	publisher = {ACM Press},
	address = {New York, NY, USA},
	doi = {10.1145/1076034.1076081},
	isbn = {1595930345},
	url = {http://portal.acm.org/citation.cfm?id=1076081},
	keywords = {information-geometry, svm},
	priority = {2},
	citeulike-article-id = {365120}
}


@article{citeulike:365088,
	author = {Efron, Brad},
	title = {[Statistical Modeling: The Two Cultures]: Comment},
	journal = {Statistical Science},
	volume = {16},
	number = {3},
	pages = {218--219},
	year = {2001},
	url = {http://links.jstor.org/sici?sici=0883-4237\%28200108\%2916\%3A3\%3C218\%3A\%5BMTTCC\%3E2.0.CO\%3B2-8},
	keywords = {philosophy},
	comment = {statisticians specify inferential bias, Breiman just gives boxes with lots of knobs},
	priority = {2},
	citeulike-article-id = {365088}
}


@article{citeulike:361889,
	author = {Reid, N.},
	title = {Saddlepoint Methods and Statistical Inference},
	journal = {Statistical Science},
	volume = {3},
	number = {2},
	pages = {213--227},
	year = {1988},
	url = {http://links.jstor.org/sici?sici=0883-4237\%28198805\%293\%3A2\%3C213\%3ASMASI\%3E2.0.CO\%3B2-Z},
	abstract = {This paper reviews Daniels' saddlepoint approximation to the distribution of the mean of a random sample, and the many aspects of second order asymptotic inference that have been developed from it. These include Barndorff-Nielsen's approximation to the distribution of the maximum likelihood estimate, Bartlett factors for the likelihood ratio statistic and approximations to predictive and conditional likelihood. The emphasis is on statistical applications of the saddlepoint method. The intention is to provide fairly broad coverage of the literature and to indicate possibilities for future development. An annotated bibliography is included.},
	keywords = {approximation},
	comment = {better than normal approximation or Edgeworth expansions},
	priority = {2},
	citeulike-article-id = {361889}
}


@article{citeulike:361869,
	abstract = {We consider inference for a scalar parameter $\psi$ in the presence of one or more nuisance parameters. The nuisance parameters are required to be orthogonal to the parameter of interest, and the construction and interpretation of orthogonalized parameters is discussed in some detail. For purposes of inference we propose a likelihood ratio statistic constructed from the conditional distribution of the observations, given maximum likelihood estimates for the nuisance parameters. We consider to what extent this is preferable to the profile likelihood ratio statistic in which the likelihood function is maximized over the nuisance parameters. There are close connections to the modified profile likelihood of Barndorff-Nielsen (1983). The normal transformation model of Box and Cox (1964) is discussed as an illustration.},
	author = {Cox, D. R. and Reid, N.},
	citeulike-article-id = {361869},
	comment = {effects of parameter orthogonality on inference in presence of nuisance parameters},
	journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
	keywords = {parametrization},
	number = {1},
	pages = {1--39},
	priority = {2},
	title = {Parameter Orthogonality and Approximate Conditional Inference},
	url = {http://links.jstor.org/sici?sici=0035-9246\%281987\%2949\%3A1\%3C1\%3APOAACI\%3E2.0.CO\%3B2-E},
	volume = {49},
	year = {1987}
}


@article{MR2125507,
	author = {{Sanjari Farsipour}, N. and Ghazvininejad, D.},
	citeulike-article-id = {361865},
	journal = {Pakistan Journal of Statistics},
	keywords = {ill, minimax},
	number = {3},
	pages = {329--334},
	priority = {2},
	title = {Minimax estimation of a bounded binomial parameter},
	volume = {20},
	year = {2004}
}


@article{ormo_maxent99,
	abstract = {We describe an algorithm to efficiently compute maximum entropy densities, i.e. densities maximizing the Shannon entropy $-\int p(x) \log p(x)\,dx$ under a set of constraints $E[g_i(x)] = c_i$, $i = 1, \dots, n$. Our method is based on an algorithm by Zellner and Highfield, which has been found not to converge under a variety of circumstances. To demonstrate that our method overcomes these difficulties, we conduct numerous experiments for the special case $g_i(x) = x^i$, $n = 4$. An extensive table of results for this case is available on the World Wide Web.},
	author = {Ormoneit, D. and White, H.},
	citeulike-article-id = {361199},
	journal = {Econometric Reviews},
	keywords = {maxent, optimization},
	number = {2},
	pages = {127--140},
	priority = {2},
	title = {An Efficient Algorithm to Compute Maximum Entropy Densities},
	volume = {18},
	year = {1999}
}


@article{citeulike:361192,
	abstract = {In this paper, reference priors are derived for three cases where partial information is available. If a subjective conditional prior is given, two reasonable methods are proposed for finding the marginal reference prior. If, instead, a subjective marginal prior is available, a method for defining the conditional reference prior is proposed. A sufficient condition is then given under which this conditional reference prior agrees with the conditional reference prior derived in the first stage of the reference prior algorithm of Berger \& Bernardo (1989, 1992). Finally, under the assumption of independence, a method for finding marginal reference priors is also proposed. Various examples are given to illustrate the methods.},
	author = {Sun, D. and Berger, J. O.},
	citeulike-article-id = {361192},
	journal = {Biometrika},
	keywords = {prior},
	number = {1},
	pages = {55--71},
	priority = {2},
	title = {Reference Priors with Partial Information},
	url = {http://links.jstor.org/sici?sici=0006-3444\%28199803\%2985\%3A1\%3C55\%3ARPWPI\%3E2.0.CO\%3B2-N},
	volume = {85},
	year = {1998}
}


% NOTE(review): institution, number and year were absent from the export and
% are filled in from memory of how this report is usually cited (ISDS
% Discussion Paper 97-42, Duke University) -- confirm before relying on them.
@techreport{citeulike:361191,
	author = {Yang, R. and Berger, J. O.},
	citeulike-article-id = {361191},
	institution = {Institute of Statistics and Decision Sciences, Duke University},
	keywords = {prior},
	number = {97-42},
	priority = {2},
	title = {A catalog of noninformative priors},
	type = {Discussion Paper},
	year = {1997}
}


@article{citeulike:361176,
	abstract = {In maximum entropy (ME) modeling, the information discrepancy between two distributions is measured in terms of their entropy difference. In discrimination information statistics the information discrepancy between two distributions is measured in terms of the Kullback-Leibler function (i.e., relative entropy or cross-entropy). This article presents an equivalence between Kullback-Leibler functions and entropy differences involving an ME distribution. Based on this equivalence, the concept of information discrimination (ID) distinguishability is introduced as a unifying framework for the two methods of measuring information discrepancy between distributions. Applications of ID distinguishability as diagnostics for examining robustness of parametric procedures and sensitivity of nonparametric statistics across parametric families of distributions is proposed. The equivalence results facilitates estimation of Kullback-Leibler functions in terms of entropy estimates. Application of the ID distinguishability to modeling failure data brings a new dimension into entropy estimation--entropy estimation based on the hazard function. ID statistics for modeling lifetime distributions with increasing failure rates are studied. Two illustrative examples are analyzed.},
	author = {Soofi, Ehsan S. and Ebrahimi, Nader and Habibullah, Mohamed},
	citeulike-article-id = {361176},
	comment = {Gives a table of exponential families over continuous domains and their corresponding sufficient statistics},
	journal = {Journal of the American Statistical Association},
	keywords = {econometrics, maxent},
	number = {430},
	pages = {657--668},
	priority = {2},
	title = {Information Distinguishability with Application to Analysis of Failure Data},
	url = {http://links.jstor.org/sici?sici=0162-1459\%28199506\%2990\%3A430\%3C657\%3AIDWATA\%3E2.0.CO\%3B2-0},
	volume = {90},
	year = {1995}
}


@article{citeulike:361173,
	author = {Soofi, Ehsan S.},
	title = {Capturing the Intangible Concept of Information},
	journal = {Journal of the American Statistical Association},
	volume = {89},
	number = {428},
	pages = {1243--1254},
	year = {1994},
	url = {http://links.jstor.org/sici?sici=0162-1459\%28199412\%2989\%3A428\%3C1243\%3ACTICOI\%3E2.0.CO\%3B2-X},
	abstract = {The purpose of this article is to discuss the intricacies of quantifying information in some statistical problems. The aim is to develop a general appreciation for the meanings of information functions rather than their mathematical use. This theme integrates fundamental aspects of the contributions of Kullback, Lindley, and Jaynes and bridges chaos to probability modeling. A synopsis of information-theoretic statistics is presented in the form of a pyramid with Shannon at the vertex and a triangular base that signifies three distinct variants of quantifying information: discrimination information (Kullback), mutual information (Lindley), and maximum entropy information (Jaynes). Examples of capturing information by the maximum entropy (ME) method are discussed. It is shown that the ME approach produces a general class of logit models capable of capturing various forms of sample and nonsample information. Diagnostics for quantifying information captured by the ME logit models are given, and decomposition of information into orthogonal components is presented. Basic geometry is used to display information graphically in a simple example. An overview of quantifying information in chaotic systems is presented, and a discrimination information diagnostic for studying chaotic data is introduced. Finally, some brief comments about future research are given.},
	keywords = {maxent},
	priority = {2},
	citeulike-article-id = {361173}
}


@article{citeulike:361171,
	author = {Soofi, E. S. and Retzer, J. J.},
	title = {Information indices: unification and applications},
	journal = {Journal of Econometrics},
	volume = {107},
	number = {1-2},
	pages = {17--40},
	month = {March},
	year = {2002},
	doi = {10.1016/S0304-4076(01)00111-7},
	url = {http://dx.doi.org/10.1016/S0304-4076(01)00111-7},
	abstract = {The unified framework of information theoretic statistics was established by Kullback (1959). Since then numerous information indices have been developed in various contexts. This paper represents many of these indices in a unified context. The unification thread is the discrimination information function: information indices are all logarithmic measures of discrepancy between two probability distributions. First, we present a summary of informational aspects of the basic information functions, a unification of various information-theoretic modeling approaches, and some explication in terms of traditional measures. We then tabulate a unified representation of assortments of information indices developed in the literature for maximum entropy modeling, covariate information, and influence diagnostics. The subjects of these indices include parametric model fitting, nonparametric entropy estimation, categorical data analysis, the linear and exponential family regression, and time series. The coverage however, is not exhaustive. The tabulation includes sampling theory and Bayesian indices, but the focus is on interpretation as descriptive measures and inferential properties are noted tangentially. Finally, applications of some information indices are illustrated through modeling duration data for Sprint's churned customer and choice of long distance provider.},
	keywords = {econometrics},
	priority = {2},
	citeulike-article-id = {361171}
}


@article{citeulike:361170,
	abstract = {1. Introduction
2. Brief summary of recent history
2.1. Information and entropy---background
2.2. Maximum entropy---background
2.3. Information, entropy and maximum-entropy revisited
2.4. Information, entropy, complexity and non-linearity
3. Information and entropy econometrics and this volume
4. Conclusion},
	author = {Golan, Amos},
	citeulike-article-id = {361170},
	doi = {10.1016/S0304-4076(01)00110-5},
	journal = {Journal of Econometrics},
	keywords = {econometrics, maxent},
	month = {March},
	number = {1-2},
	pages = {1--15},
	priority = {2},
	title = {Information and Entropy Econometrics},
	url = {http://dx.doi.org/10.1016/S0304-4076(01)00110-5},
	volume = {107},
	year = {2002}
}


% NOTE(review): @techreport requires an `institution`; the export does not
% supply one, so it is still missing here -- fill in once confirmed.
@techreport{citeulike:360674,
	abstract = {Logistic regression is a workhorse of statistics and is closely related to methods used in Machine Learning, including the Perceptron and the Support Vector Machine. This note compares eight different algorithms for computing the maximum a-posteriori parameter estimate. A full derivation of each algorithm is given. In particular, a new derivation of Iterative Scaling is given which applies more generally than the conventional one. A new derivation is also given for the Modified Iterative Scaling algorithm of Collins et al (2002). Most of the algorithms operate in the primal space, but can also work in dual space. All algorithms are compared in terms of computational complexity by experiments on large data sets. The fastest algorithms turn out to be conjugate gradient ascent and quasi-Newton algorithms, which far outstrip Iterative Scaling and its variants.},
	author = {Minka, Thomas P.},
	citeulike-article-id = {360674},
	keywords = {optimization},
	priority = {2},
	title = {A comparison of numerical optimizers for logistic regression},
	year = {2004}
}

@unpublished{citeulike:359058,
	abstract = {In this paper, we propose a family of surrogate maximization (SM) algorithms
for multi-class logistic regression models (also called conditional exponential
models). An SM algorithm aims at turning an otherwise intractable maximization
problem into a tractable one by iterating two steps. The S-step computes
a tractable surrogate function to substitute the original objective function, and the
M-step seeks to maximize this surrogate function. We apply SM algorithms to
logistic regression models, leading to the standard SM, generalized SM, gradient
SM, and quadratic SM algorithms. Compared with Newton's method, these SM
algorithms dramatically save computational costs when either the dimensionality
or number of data samples is huge. Finally, we demonstrate the efficacy of these
SM algorithms and compare their empirical performance on text categorization.},
	author = {Zhang, Zhihua and Kwok, James T. and Yeung, Dit-Yan and Wang, Gang},
	citeulike-article-id = {359058},
	keywords = {optimization},
	note = {Unpublished manuscript},
	priority = {2},
	title = {Convexity, Surrogate Functions and Iterative Optimization in Multi-class Logistic Regression Models},
	url = {http://citeseer.ist.psu.edu/694923.html},
	year = {2004}
}


@article{Bohning1992,
	author = {B{\"o}hning, Dankmar},
	citeulike-article-id = {359019},
	doi = {10.1007/BF00048682},
	journal = {Annals of the Institute of Statistical Mathematics},
	keywords = {optimization},
	month = {March},
	number = {1},
	pages = {197--200},
	priority = {2},
	title = {Multinomial logistic regression algorithm},
	url = {http://dx.doi.org/10.1007/BF00048682},
	volume = {44},
	year = {1992}
}


@inproceedings{2003AIPC..659..307S,
	author = {Snoussi, H. and Mohammad-Djafari, A.},
	title = {Information geometry and prior selection},
	booktitle = {AIP Conf. Proc. 659: Bayesian Inference and Maximum Entropy Methods in Science and Engineering},
	pages = {307--327},
	year = {2003},
	url = {http://www.cs.orst.edu/~bulatov/papers/snoussi-information.pdf},
	abstract = {In this contribution, we study the problem of prior selection arising in Bayesian inference. There is an extensive literature on the construction of non informative priors and the subject seems far from a definite solution [1]. Here we revisit this subject with differential geometry tools and propose to construct the prior in a Bayesian decision theoretic framework. We show how the construction of a prior by projection is the best way to take into account the restriction to a particular family of parametric models. For instance, we apply this procedure to the curved parametric families where the ignorance is directly expressed by the relative geometry of the restricted model in the wider model containing it.},
	keywords = {information-geometry},
	priority = {2},
	citeulike-article-id = {358676}
}


@article{citeulike:358673,
	abstract = {When learning processes depend on samples but not on the order of the information
in the sample, then the Bernoulli distribution is relevant and Bernstein polynomials
enter into the analysis. We derive estimates of the approximation of the entropy function
$x \log x$ that are sharper than the bounds from Voronovskaja's theorem. In this
way we get the correct asymptotics for the Kullback--Leibler distance for an encoding
problem.},
	address = {Orlando, FL, USA},
	author = {Braess, Dietrich and Sauer, Thomas},
	citeulike-article-id = {358673},
	doi = {10.1016/j.jat.2004.04.010},
	issn = {0021-9045},
	journal = {Journal of Approximation Theory},
	keywords = {minimax},
	month = {June},
	number = {2},
	pages = {187--206},
	priority = {2},
	publisher = {Academic Press, Inc.},
	title = {{Bernstein} polynomials and learning theory},
	url = {http://portal.acm.org/citation.cfm?id=1022239.1022244},
	volume = {128},
	year = {2004}
}


% NOTE(review): no journal, volume or year is recorded, so this is typed as
% @unpublished (presumably a working paper); the author's given name is
% supplied from memory -- confirm, and switch back to @article if a journal
% version is identified.
@unpublished{citeulike:358671,
	abstract = {Neyman and Scott (1948) define the incidental parameter problem. In panel data with T observations per individual and unobservable individual-specific effects, the inconsistency of the maximum likelihood estimator of the common parameters is in general of the order 1/T. This paper considers the integrated likelihood estimator and develops the integrated moment estimator. It shows that the inconsistency of the integrated likelihood estimator reduces from 1/T to $1/T^2$ if an information orthogonal parametrization is used. It derives information orthogonal moment functions for the general linear model and the index model with weakly exogenous regressors and thereby offers an approximate solution for the incidental parameter problem for a wide range of models. It argues that reparametrizations are easier in a Bayesian framework and shows how to use the $1/T^2$ result to increase the robustness against the choice of mixing distribution. The integrated likelihood estimator is consistent and adaptive for asymptotics in which T proportional to N to the power alpha where alpha is larger than 1/3. The paper also shows that likelihood methods that use sufficient statistics for the individual-specific effects can be viewed as a special case of the integrated likelihood estimator.},
	author = {Woutersen, Tiemen},
	citeulike-article-id = {358671},
	comment = {Reparametrization improves bayesian inference},
	keywords = {bayesian, parametrization},
	note = {Working paper},
	priority = {2},
	title = {Robustness against incidental parameters and mixing distributions},
	url = {http://www.cs.orst.edu/~bulatov/papers/woutersen-robustness.pdf}
}


@article{MR1644287,
	author = {Wieczorkowski, R.},
	citeulike-article-id = {358663},
	journal = {Statistics \& Decisions},
	keywords = {ill, minimax},
	number = {3},
	pages = {289--298},
	priority = {2},
	title = {Calculating the minimax estimator of a binomial probability with entropy loss function and its comparison with other estimators of a binomial probability},
	url = {http://www.cs.orst.edu/~bulatov/papers/wieczorkowski.pdf},
	volume = {16},
	year = {1998}
}


@article{MR1782342,
	author = {Marchand, {\'E}ric and MacGibbon, Brenda},
	citeulike-article-id = {358662},
	journal = {Statistics \& Decisions},
	keywords = {ill, minimax},
	number = {2},
	pages = {129--167},
	priority = {2},
	title = {Minimax estimation of a constrained binomial proportion},
	url = {http://www.cs.orst.edu/~bulatov/papers/marchand-minimax.pdf},
	volume = {18},
	year = {2000}
}


@article{citeulike:358661,
	abstract = {In this paper we present a direct and simple approach to obtain bounds on the asymptotic minimax risk for the estimation of constrained binomial and multinomial proportions. Quadratic, normalized quadratic and entropy loss are considered and it is demonstrated that in all cases linear estimators are asymptotically minimax optimal. For the quadratic loss function the asymptotic minimax risk does not change unless a neighborhood of the point $1/2$ is excluded by the restrictions on the parameter space. For the two other loss functions the asymptotic behavior of the minimax risk is not changed by such additional knowledge about the location of the unknown probability. The results are also extended to the problem of minimax estimation of a vector of constrained multinomial probabilities.},
	author = {Braess, Dietrich and Dette, Holger},
	citeulike-article-id = {358661},
	comment = {(private-note)In the unconstrained case (i.e. we have no prior information on p, so not
in your setting where p<=.5), there's a minimax estimator of p, which is
(k + beta)  / (n+2beta)  where beta = sqrt(n) / 2.
(k is number of heads observed, n the total number of flips)

I'm getting this from
http://sankhya.isical.ac.in/search/66\_4/2004041.html
http://sankhya.isical.ac.in/search/servlet/Weight?path=66\_4/2004041.pdf

But to continue the summary:
There seem to be papers covering special cases of restricted parameter
sets, e.g. if we know .5-a <= p <= .5 +a, for some 0<a<.5.

Also, when we know p comes from a small range close to 0, getting smaller
with n.

The paper I'm giving the link for gives some asymptotic minimaxity results
for the case with the restricted parameter space.},
	journal = {Sankhya: The Indian Journal of Statistics},
	keywords = {minimax},
	number = {4},
	pages = {707--732},
	priority = {2},
	title = {The Asymptotic Minimax Risk for the Estimation of Constrained Binomial and Multinomial Probabilities},
	volume = {66},
	year = {2004}
}


@article{citeulike:356507,
	abstract = {Let $(F_k)_{k \geq 1}$ be a nested family of parametric classes of densities with finite Vapnik-Chervonenkis dimension. Let $f$ be a probability density belonging to $F_{k^*}$, where $k^*$ is the unknown smallest integer such that $f \in F_k$. Given a random sample $X_1, \dots, X_n$ drawn from $f$, an integer $k_0 \geq 1$ and a real number $\alpha \in (0,1)$, we introduce a new, simple, explicit $\alpha$-level consistent testing procedure of the hypothesis $\{H_0 : k^* = k_0\}$ versus the alternative $\{H_1 : k^* \neq k_0\}$. Our method is inspired by the combinatorial tools developed in Devroye and Lugosi and it includes a wide range of density models, such as mixture models, neural networks, or exponential families.},
	author = {Biau, G. and Devroye, L.},
	citeulike-article-id = {356507},
	comment = {Could this be used as a feature induction stopping criterion?},
	journal = {IEEE Transactions on Information Theory},
	keywords = {model-selection},
	number = {3},
	pages = {576--581},
	priority = {2},
	title = {A note on density model size testing},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1273672},
	volume = {50},
	year = {2004}
}

@inproceedings{citeulike:356504,
	abstract = {This paper presents a linear programming approach to discriminative training. We first define a measure of discrimination of an arbitrary conditional probability model on a set of labeled training data. We consider maximizing discrimination on a parametric family of exponential models that arises naturally in the maximum entropy framework. We show that this optimization problem is globally convex in $R^n$, and is moreover piecewise linear on $R^n$. We propose a solution that involves solving a series of linear programming problems. We provide a characterization of global optimizers. We compare this framework with those of minimum classification error and maximum entropy.},
	author = {Papineni, K. A.},
	booktitle = {Proceedings of the 1999 {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP} '99)},
	citeulike-article-id = {356504},
	keywords = {generative-discriminative, optimization},
	pages = {561--564},
	priority = {2},
	title = {Discriminative training via linear programming},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=759722},
	volume = {2},
	year = {1999}
}

@article{MR1353574,
	author = {Achcar, J. A.},
	citeulike-article-id = {356500},
	journal = {Pakistan Journal of Statistics},
	keywords = {exponential-families, ill, parametrization},
	number = {3},
	pages = {597--616},
	priority = {2},
	title = {Some aspects of reparametrization for statistical models},
	url = {http://www.cs.orst.edu/~bulatov/papers/achcar-some.pdf},
	volume = {10},
	year = {1994}
}

@article{citeulike:355918,
	abstract = {We introduce a new class of upper bounds on the log partition function of a Markov random field (MRF). This quantity plays an important role in various contexts, including approximating marginal distributions, parameter estimation, combinatorial enumeration, statistical decision theory, and large-deviations bounds. Our derivation is based on concepts from convex duality and information geometry: in particular, it exploits mixtures of distributions in the exponential domain, and the Legendre mapping between exponential and mean parameters. In the special case of convex combinations of tree-structured distributions, we obtain a family of variational problems, similar to the Bethe variational problem, but distinguished by the following desirable properties: i) they are convex, and have a unique global optimum; and ii) the optimum gives an upper bound on the log partition function. This optimum is defined by stationary conditions very similar to those defining fixed points of the sum-product algorithm, or more generally, any local optimum of the Bethe variational problem. As with sum-product fixed points, the elements of the optimizing argument can be used as approximations to the marginals of the original model. The analysis extends naturally to convex combinations of hypertree-structured distributions, thereby establishing links to Kikuchi approximations and variants.},
	author = {Wainwright, M. J. and Jaakkola, T. S. and Willsky, A. S.},
	citeulike-article-id = {355918},
	journal = {IEEE Transactions on Information Theory},
	keywords = {exponential-families},
	number = {7},
	pages = {2313--2335},
	priority = {2},
	title = {A new class of upper bounds on the log partition function},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1459045},
	volume = {51},
	year = {2005}
}


@inproceedings{MR1660616,
	address = {River Edge, NJ},
	author = {Subanar},
	booktitle = {Proceedings of the Second Asian Mathematical Conference 1995 (Nakhon Ratchasima)},
	citeulike-article-id = {355422},
	keywords = {ill, minimax},
	pages = {531--539},
	priority = {2},
	publisher = {World Scientific Publishing},
	title = {Some problems on minimax estimation},
	url = {http://www.cs.orst.edu/~bulatov/papers/subanar-some.pdf},
	year = {1998}
}


@article{MR2108735,
	author = {Wencheko, E. and Wijekoon, P.},
	citeulike-article-id = {355420},
	journal = {Statistical Papers},
	keywords = {estimation, exponential-families},
	number = {1},
	pages = {101--115},
	priority = {2},
	title = {Improved estimation of the mean in one-parameter exponential families with known coefficient of variation},
	url = {http://www.cs.orst.edu/~bulatov/papers/wencheko-improved.pdf},
	volume = {46},
	year = {2005}
}


@article{Mohammadi2002,
	abstract = {The best invariant estimators for a continuous distribution function under monotone transformations and the weighted Cram{\'e}r--von Mises loss function were introduced by O. P. Aggarwal [Ann. Math. Statist. 26 (1955), 450--463; MR0070914 (17,54d)]. Q. Q. Yu [Ann. Inst. Statist. Math. 44 (1992), no. 4, 729--735; MR1200526 (94a:62013)] and Yu and E. Phadia [Ann. Statist. 20 (1992), no. 4, 2192--2195; MR1193337 (93j:62018)] proved that the best invariant estimator is minimax under the loss function $L(F,d)=\int\vert F(t)-d(t)\vert^r h(F(t))dF(t)$, where $r\geq1$, $h$ is a nonnegative weight function and $d$ is a nondecreasing function from $\mathbb{R}$ to $[0,1]$ and under the Kolmogorov--Smirnov loss function $L(F,d)=\sup_t\vert F(t)-d(t)\vert$. The authors consider another measure of the distance between a probability distribution and its estimator. They use analogues of the well-known Kullback--Leibler divergence or entropy loss functions. The first loss function is of the form $$L(F,d)=\int\left(\frac{F(t)}{d(t)}- \ln\left(\frac{F(t)}{d(t)}\right)-1\right) dF(t).$$ The second one is $$L(F,d)=\int\left(F(t)\ln\left(\frac{F(t)}{d(t)}\right)+(1-F(t)) \ln\left(\frac{1-F(t)}{1-d(t)}\right)\right)dF(t).$$ Let $X=(X_1,\dots,X_n)$ be a random sample of size $n$ from an unknown absolutely continuous distribution function $F(t)$ and let $Y=(Y_1,\dots,Y_n), Y_0=0,Y_{n+1}=1$ be the vector of order statistics of $X_1,\dots,X_n$. The authors prove that the best invariant estimator of $F(t)$ under these two entropy loss functions is of the form $$d_0=d_0(Y,t)=\sum_{i=0}^n\frac{i+1}{n+2}\mathbf{1}(Y_{i}\leq t\leq Y_{i+1}), i=0,1,\dots,n.$$ This invariant estimator is the same as the best invariant estimator of a continuous distribution function under the mean square error loss function. It is minimax for any sample size $n\geq1$.},
	author = {Mohammadi, Leila and van Zwet, Willem R.},
	citeulike-article-id = {355381},
	doi = {10.1007/s001840100152},
	journal = {Metrika},
	keywords = {minimax},
	pages = {31--42},
	priority = {2},
	title = {Minimax invariant estimation of a continuous distribution function under entropy loss},
	url = {http://www.springerlink.com/openurl.asp?genre=article\&id=doi:10.1007/s001840100152},
	volume = {56},
	year = {2002}
}


@article{citeulike:353473,
	address = {Cambridge, MA, USA},
	author = {Blei, David M. and Ng, Andrew Y. and Jordan, Michael I.},
	citeulike-article-id = {353473},
	issn = {1533-7928},
	journal = {Journal of Machine Learning Research},
	keywords = {mlrg, process},
	pages = {993--1022},
	priority = {0},
	publisher = {MIT Press},
	title = {Latent {Dirichlet} Allocation},
	url = {http://portal.acm.org/citation.cfm?id=944937},
	volume = {3},
	year = {2003}
}


@techreport{citeulike:353461,
	abstract = {Recently there have been several studies that examined monetary policy under model uncertainty. These studies formulated uncertainty in a number of different ways. One of the prominent ways to formulate model uncertainty is to form a non-parametric set of perturbations around some nominal model where the set is structured so that the uncertainty is focused on potentially important weaknesses of the model. Unfortunately, previous efforts were unable to compute exact optimal policy rules under this general formulation of uncertainty. Moreover, for those special cases when the robust rules were computed, the degree of their aggressiveness was often counterintuitive in light of conventional Brainard/Bayesian wisdom that policy under uncertainty should be conservative. This paper, therefore, consists of three different exercises concerning minimax analysis of policy rules under model uncertainty. First, the minimax approach is compared with the Bayesian one in a stylized Brainard (1967) setting. Strong similarities between recommendations of the two approaches are found. Next, a more realistic setting such as in Onatski and Stock (1999) is considered. A characterization of the worst possible models corresponding to the max part of the minimax scheme is given. It is shown that the worst possible models for very aggressive rules, such as the H-infinity rule, have realistic economic structure whereas those for passive rules, such as the actual Fed's policy, are not plausible. Thus, the results of minimax analysis presented in Onatski and Stock (1999) might be biased against the passive rules. Finally, exact optimal minimax policy rules for the case of slowly time-varying uncertainty in the case of the Rudebusch and Svensson's (1998) model are computed. The optimal rule under certainty turns out to be robust to moderate deviations from Rudebusch and Svensson's model.},
	author = {Onatski, Alexei},
	citeulike-article-id = {353461},
	institution = {Econometric Society},
	keywords = {bayesian, minimax},
	number = {1818},
	priority = {2},
	title = {Minimax Analysis of Monetary Policy Under Model Uncertainty},
	type = {World Congress 2000 Contributed Papers},
	url = {http://ideas.repec.org/p/ecm/wc2000/1818.html},
	year = {2000}
}


@article{citeulike:353460,
	abstract = {This paper considers the nonparametric estimator of a mixing density $g$ using the random sample $X_1,\cdots, X_n$ with probability density function $f(x;g)=\int_0^{\theta^*}f(x\mid\theta)g(\theta)d\theta$, where $f(\cdot\mid\theta)$ is a known parametric family of probability density functions. It is assumed that $f(x\mid\theta)=c(\theta)q(x)\theta^x$ for all $x=0,1,2,\cdots$ and $q(x)>0$. It is shown in the paper that estimators based upon orthogonal polynomials can achieve the minimax convergence rate with respect to the integrated mean square error. The convergence rate is shown to be logarithmic. The estimator is tested empirically for finite sample performance. It is demonstrated that estimators based upon orthogonal polynomials perform better than kernel mixing density estimators. To improve performance, an estimator of the optimal truncation parameter is given.},
	author = {Loh, Wei-Liem and Zhang, Cun-Hui},
	citeulike-article-id = {353460},
	journal = {Scandinavian Journal of Statistics},
	keywords = {exponential-families},
	number = {1},
	pages = {15--32},
	priority = {2},
	title = {Estimating mixing densities in exponential family models for discrete variables},
	url = {http://www.cs.orst.edu/~bulatov/papers/loh-estimating.pdf},
	volume = {24},
	year = {1997}
}


@article{MR2076066,
	author = {Marriott, Paul   and Vos, Paul  },
	title = {On the global geometry of parametric models and information recovery},
	journal = {Bernoulli},
	volume = {10},
	number = {4},
	pages = {639--649},
	year = {2004},
	url = {http://www.cs.orst.edu/~bulatov/papers/marriott-on.pdf},
	keywords = {information-geometry},
	citeulike-article-id = {353459},
	priority = {2},
	abstract = {We examine the question of which statistic or statistics should be used in order to recover information important for inference. We take a global geometric viewpoint, developing the local geometry of Amari. By examining the behaviour of simple geometric models, we show how not only the local curvature properties of parametric families but also the global geometric structure can be of crucial importance in finite-sample analysis. The tool we use to explore this global geometry is the Karhunen-Lo\`{e}ve decomposition. Using global geometry, we show that the maximum likelihood estimate is the most important one-dimensional summary of information, but that traditional methods of information recovery beyond the maximum likelihood estimate can perform poorly. We also use the global geometry to construct better information summaries to be used with the maximum likelihood estimate.}
}


@article{citeulike:353458,
	address = {Hingham, MA, USA},
	author = {Evgeniou, Theodoros and Pontil, Massimiliano and Poggio, Tomaso},
	citeulike-article-id = {353458},
	doi = {10.1023/A:1008110632619},
	issn = {0920-5691},
	journal = {International Journal of Computer Vision},
	keywords = {svm},
	month = jun,
	number = {1},
	pages = {9--13},
	priority = {2},
	publisher = {Kluwer Academic Publishers},
	title = {Statistical Learning Theory: A Primer},
	url = {http://portal.acm.org/citation.cfm?id=355340},
	volume = {38},
	year = {2000}
}


@article{citeulike:353457,
	abstract = {Definitions are given for orthogonal parameters in the context of Bayesian inference and likelihood inference. The exact orthogonalizing transformations are derived for both cases, and the connection between the two settings is made precise. These parametrizations simplify the interpretation of likelihood functions and posterior distributions. Further, they make numerical maximization and integration procedures easier to apply. Several applications are studied.},
	author = {Tibshirani, Robert and Wasserman, Larry},
	citeulike-article-id = {353457},
	journal = {The Canadian Journal of Statistics},
	keywords = {information-geometry, parametrization},
	number = {1},
	pages = {163--173},
	priority = {2},
	title = {Some aspects of the reparametrization of statistical models},
	url = {http://www.cs.orst.edu/~bulatov/papers/tibshirani-some.pdf},
	volume = {22},
	year = {1994}
}


@article{citeulike:353426,
	abstract = {Statistical learning theory was introduced in the late 1960's. Until the 1990's it was a purely theoretical analysis of the problem of function estimation from a given collection of data. In the middle of the 1990's new types of learning algorithms (called support vector machines) based on the developed theory were proposed. This made statistical learning theory not only a tool for the theoretical analysis but also a tool for creating practical algorithms for estimating multidimensional functions. This article presents a very general overview of statistical learning theory including both theoretical and algorithmic aspects of the theory. The goal of this overview is to demonstrate how the abstract learning theory established conditions for generalization which are more general than those discussed in classical statistical paradigms and how the understanding of these conditions inspired new algorithmic approaches to function estimation problems.},
	author = {Vapnik, Vladimir N.},
	citeulike-article-id = {353426},
	journal = {IEEE Transactions on Neural Networks},
	keywords = {statistical-learning-theory},
	number = {5},
	pages = {988--999},
	priority = {2},
	title = {An overview of statistical learning theory},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=788640},
	volume = {10},
	year = {1999}
}


@unpublished{citeulike:352476,
	author = {Syversveen, Anne Randi},
	citeulike-article-id = {352476},
	keywords = {bayesian, prior},
	note = {Preprint Statistics No. 3/1998, Department of Mathematical Sciences, Norwegian University of Science and Technology},
	priority = {2},
	title = {Noninformative {Bayesian} priors. Interpretation and problems with construction and applications},
	url = {http://www.math.ntnu.no/preprint/statistics/1998/S3-1998.ps},
	year = {1998}
}


@article{citeulike:352254,
	author = {Morris, Carl  N. },
	title = {Natural Exponential Families with Quadratic Variance Functions},
	journal = {The Annals of Statistics},
	volume = {10},
	number = {1},
	pages = {65--80},
	year = {1982},
	url = {http://links.jstor.org/sici?sici=0090-5364\%28198203\%2910\%3A1\%3C65\%3ANEFWQV\%3E2.0.CO\%3B2-C},
	keywords = {exponential-families},
	citeulike-article-id = {352254},
	priority = {2},
	abstract = {The normal, Poisson, gamma, binomial, and negative binomial distributions are univariate natural exponential families with quadratic variance functions (the variance is at most a quadratic function of the mean). Only one other such family exists. Much theory is unified for these six natural exponential families by appeal to their quadratic variance property, including infinite divisibility, cumulants, orthogonal polynomials, large deviations, and limits in distribution.}
}


@article{citeulike:351875,
	abstract = {The goal of this paper is to complete results available about I-projections, reverse I-projections, and their generalized versions, with focus on linear and exponential families. Pythagorean-like identities and inequalities are revisited and generalized, and generalized maximum-likelihood (ML) estimates for exponential families are introduced. The main tool is a new concept of extension of exponential families, based on our earlier results on convex cores of measures.},
	author = {Csisz{\'a}r, Imre and Mat{\'u}{\v{s}}, Franti{\v{s}}ek},
	citeulike-article-id = {351875},
	journal = {IEEE Transactions on Information Theory},
	keywords = {information-geometry},
	number = {6},
	pages = {1474--1490},
	priority = {2},
	title = {Information projections revisited},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1201070},
	volume = {49},
	year = {2003}
}


@article{citeulike:351869,
	abstract = {An exponential family or mixture family of probability distributions has a natural hierarchical structure. This paper gives an ``orthogonal'' decomposition of such a system based on information geometry. A typical example is the decomposition of stochastic dependency among a number of random variables. In general, they have a complex structure of dependencies. Pairwise dependency is easily represented by correlation, but it is more difficult to measure effects of pure triplewise or higher order interactions (dependencies) among these variables. Stochastic dependency is decomposed quantitatively into an ``orthogonal'' sum of pairwise, triplewise, and further higher order dependencies. This gives a new invariant decomposition of joint entropy. This problem is important for extracting intrinsic interactions in firing patterns of an ensemble of neurons and for estimating its functional connections. The orthogonal decomposition is given in a wide class of hierarchical structures including both exponential and mixture families. As an example, we decompose the dependency in a higher order Markov chain into a sum of those in various lower order Markov chains.},
	author = {Amari, Shun-ichi},
	citeulike-article-id = {351869},
	journal = {IEEE Transactions on Information Theory},
	keywords = {information-geometry},
	number = {5},
	pages = {1701--1711},
	priority = {2},
	title = {Information geometry on hierarchy of probability distributions},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=930911},
	volume = {47},
	year = {2001}
}


@misc{citeulike:351142,
	abstract = {A new method is presented for modeling low-dimensional representations of high-dimensional multinomial and compositional data. The data are fit to subfamilies of the multinomial family which are defined using the multinomial information geometry. These collections of spherical subfamilies have a number of advantages over the affine subfamilies constructed by methods such as canonical and correspondence analysis, traditionally fit to such data. First, they can describe more complex shapes in ...},
	author = {Gous, Alan},
	citeulike-article-id = {351142},
	keywords = {exponential-families, information-geometry},
	priority = {2},
	title = {Spherical Subfamily Models},
	url = {http://citeseer.ist.psu.edu/gous99spherical.html},
	year = {1999}
}


@book{citeulike:342890,
	author = {Brown, Lawrence D.},
	citeulike-article-id = {342890},
	howpublished = {Paperback},
	isbn = {0940600102},
	keywords = {exponential-families},
	month = jun,
	priority = {2},
	publisher = {Institute of Mathematical Statistics},
	series = {{IMS} Lecture Notes--Monograph Series},
	title = {Fundamentals of Statistical Exponential Families},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0940600102},
	volume = {9},
	year = {1987}
}


@article{citeulike:341748,
	author = {Cox, R. T. },
	title = {Probability, Frequency and Reasonable Expectation},
	journal = {American Journal of Physics},
	volume = {14},
	number = {1},
	pages = {1--13},
	year = {1946},
	url = {http://www.cs.orst.edu/~bulatov/papers/cox-probability.pdf},
	keywords = {bayesian, philosophy},
	citeulike-article-id = {341748},
	priority = {2}
}


@article{citeulike:342776,
	abstract = {This paper contains an analysis of the performance of Bayes conditional-mean parameter estimators. The main result is that on a finite parameter space such estimates exhibit a mean-square error that diminishes exponentially with the number of observations, the observations being assumed to be independent. Two situations are discussed: true parameter included in the parameter space and true parameter not included in the parameter space. In the former instance only very general assumptions are required to demonstrate the exponential convergence rate. In the latter case the existence of an information function must be invoked. Comments on the continuous-parameter-space realization of the estimator and a discussion of the convergence mechanism are also included.},
	author = {Liporace, L.},
	citeulike-article-id = {342776},
	journal = {IEEE Transactions on Information Theory},
	keywords = {estimation},
	number = {6},
	pages = {665--669},
	priority = {2},
	title = {Variance of {Bayes} estimates},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1054718},
	volume = {17},
	year = {1971}
}


@misc{citeulike:335752,
	author = {Klein, Dan and Manning, Christopher D.},
	citeulike-article-id = {335752},
	howpublished = {Tutorial at {ACL}},
	keywords = {maxent},
	priority = {2},
	title = {Maxent Models, Conditional Estimation, and Optimization, without the Magic},
	url = {http://www.cs.orst.edu/~bulatov/papers/klein-maxent.pdf},
	year = {2003}
}


@misc{citeulike:335751,
	author = {Barndorff-Nielsen, Ole E.},
	citeulike-article-id = {335751},
	howpublished = {Encyclopedia of Statistical Sciences},
	keywords = {exponential-families},
	priority = {2},
	title = {Exponential Families},
	url = {http://www.cs.orst.edu/~bulatov/papers/barndorff-exponential.pdf}
}


@article{citeulike:333455,
	abstract = {The Fisher information J(X) of a random variable X under a translation parameter appears in information theory in the classical proof of the entropy-power inequality (EPI). It enters the proof of the EPI via the De-Bruijn identity, where it measures the variation of the differential entropy under a Gaussian perturbation, and via the convolution inequality $J(X+Y)^{-1} \geq J(X)^{-1}+J(Y)^{-1}$ (for independent X and Y), known as the Fisher information inequality (FII). The FII is proved in the literature directly, in a rather involved way. We give an alternative derivation of the FII, as a simple consequence of a ``data processing inequality'' for the Cram{\'e}r--Rao lower bound on parameter estimation.},
	author = {Zamir, Ram},
	citeulike-article-id = {333455},
	journal = {IEEE Transactions on Information Theory},
	keywords = {estimation, fisher-information},
	number = {3},
	pages = {1246--1250},
	priority = {2},
	title = {A proof of the {Fisher} information inequality via a data processing argument},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=669301},
	volume = {44},
	year = {1998}
}


@unpublished{citeulike:331414,
	abstract = {Generalized Linear Model (GLM) theory represents a significant advance beyond linear regression theory, specifically in expanding the choice of probability distributions from the Normal to the Natural exponential Family. This Primer is intended for GLM users seeking a handy reference on the model's distribution assumptions. The Exponential Family of Distributions is introduced with emphasis on variance structures that may be suitable for aggregate loss models in property casualty insurance.},
	author = {Clark, David R. and Thayer, Charles A.},
	citeulike-article-id = {331414},
	keywords = {exponential-families},
	note = {Casualty Actuarial Society Discussion Paper Program},
	priority = {2},
	title = {A Primer on the Exponential Family of Distributions},
	url = {http://www.casact.org/pubs/dpp/dpp04/04dpp117.pdf},
	year = {2004}
}


@article{citeulike:329236,
	author = {Vajda, Igor and van der Meulen, E.},
	citeulike-article-id = {329236},
	journal = {Applications of Mathematics},
	keywords = {exponential-families},
	month = feb,
	number = {1},
	pages = {23--51},
	priority = {2},
	title = {Global statistical information in exponential experiments and selection of exponential models},
	url = {http://www.metapress.com/link.asp?id=U4766W5K13P04R73},
	volume = {43},
	year = {1998}
}


@article{citeulike:327218,
	abstract = {The maximum entropy approach is a flexible and powerful tool for density approximation. This paper proposes a sequential updating method to calculate the maximum entropy density subject to known moment constraints. Instead of imposing the moment constraints simultaneously, the sequential updating method incorporates the moment constraints into the calculation from lower to higher moments and updates the density estimates sequentially. The proposed method is employed to approximate the size distribution of U.S. family income. Empirical evidence demonstrates the efficiency of this method.},
	author = {Wu, Ximing},
	citeulike-article-id = {327218},
	comment = {Some obvious errors in formulas and notation.},
	doi = {10.1016/S0304-4076(03)00114-3},
	issn = {0304-4076},
	journal = {Journal of Econometrics},
	keywords = {maxent},
	month = aug,
	number = {2},
	pages = {347--354},
	priority = {1},
	title = {Calculation of maximum entropy densities with application to income distribution},
	url = {http://dx.doi.org/10.1016/S0304-4076(03)00114-3},
	volume = {115},
	year = {2003}
}


@article{citeulike:327209,
	abstract = {The maximum-entropy approach to the solution of underdetermined inverse problems is studied in detail in the context of the classical moment problem. In important special cases, such as the Hausdorff moment problem, we establish necessary and sufficient conditions for the existence of a maximum-entropy solution and examine the convergence of the resulting sequence of approximations. A number of explicit illustrations are presented. In addition to some elementary examples, we analyze the maximum-entropy reconstruction of the density of states in harmonic solids and of dynamic correlation functions in quantum spin systems. We also briefly indicate possible applications to the Lee--Yang theory of Ising models, to the summation of divergent series, and so on. The general conclusion is that maximum entropy provides a valuable approximation scheme, a serious competitor of traditional Pad{\'e}-like procedures. Journal of Mathematical Physics is copyrighted by The American Institute of Physics.},
	author = {Mead, Lawrence R. and Papanicolaou, N.},
	citeulike-article-id = {327209},
	doi = {10.1063/1.526446},
	journal = {Journal of Mathematical Physics},
	keywords = {maxent, nonparametric},
	number = {8},
	pages = {2404--2417},
	priority = {2},
	publisher = {AIP},
	title = {Maximum entropy in the problem of moments},
	url = {http://scitation.aip.org/getabs/servlet/GetabsServlet?prog=normal\&id=JMAPAQ000025000008002404000001\&idtype=cvips\&gifs=yes},
	volume = {25},
	year = {1984}
}


@article{citeulike:312685,
	author = {Walter, G.  and Blum, J. },
	title = {Probability Density Estimation Using Delta Sequences},
	journal = {The Annals of Statistics},
	volume = {7},
	number = {2},
	pages = {328--340},
	year = {1979},
	url = {http://links.jstor.org/sici?sici=0090-5364\%28197903\%297\%3A2\%3C328\%3APDEUDS\%3E2.0.CO\%3B2-6},
	keywords = {density, nonparametric},
	citeulike-article-id = {312685},
	priority = {2}
}


@article{citeulike:312402,
	abstract = {A new projection pursuit algorithm for exploring multivariate data is presented that has both statistical and computational advantages over previous methods. A number of practical issues concerning its application are addressed. A connection to multivariate density estimation is established, and its properties are investigated through simulation studies and application to real data. The goal of exploratory projection pursuit is to use the data to find low- (one-, two-, or three-) dimensional projections that provide the most revealing views of the full-dimensional data. With these views the human gift for pattern recognition can be applied to help discover effects that may not have been anticipated in advance. Since linear effects are directly captured by the covariance structure of the variable pairs (which are straightforward to estimate) the emphasis here is on the discovery of nonlinear effects such as clustering or other general nonlinear associations among the variables. Although arbitrary nonlinear effects are impossible to parameterize in full generality, they are easily recognized when presented in a low-dimensional visual representation of the data density. Projection pursuit assigns a numerical index to every projection that is a functional of the projected data density. The intent of this index is to capture the degree of nonlinear structuring present in the projected distribution. The pursuit consists of maximizing this index with respect to the parameters defining the projection. Since it is unlikely that there is only one interesting view of a multivariate data set, this procedure is iterated to find further revealing projections. After each maximizing projection has been found, a transformation is applied to the data that removes the structure present in the solution projection while preserving the multivariate structure that is not captured by it. The projection pursuit algorithm is then applied to these transformed data to find additional views that may yield further insight. This projection pursuit algorithm has potential advantages over other dimensionality reduction methods that are commonly used for data exploration. It focuses directly on the ``interestingness'' of a projection rather than indirectly through the interpoint distances. This allows it to be unaffected by the scale and (linear) correlational structure of the data, helping it to overcome the ``curse of dimensionality'' that tends to plague methods based on multidimensional scaling, parametric mapping, cluster analysis, and principal components.},
	author = {Friedman, Jerome H.},
	citeulike-article-id = {312402},
	journal = {Journal of the American Statistical Association},
	keywords = {density, nonparametric},
	number = {397},
	pages = {249--266},
	priority = {2},
	title = {Exploratory Projection Pursuit},
	url = {http://links.jstor.org/sici?sici=0162-1459\%28198703\%2982\%3A397\%3C249\%3AEPP\%3E2.0.CO\%3B2-E},
	volume = {82},
	year = {1987}
}


@article{citeulike:312399,
	address = {Cambridge, MA, USA},
	author = {Girolami, Mark},
	citeulike-article-id = {312399},
	doi = {10.1162/089976602317250942},
	issn = {0899-7667},
	journal = {Neural Computation},
	keywords = {nonparametric, svm},
	month = mar,
	number = {3},
	pages = {669--688},
	priority = {3},
	publisher = {MIT Press},
	title = {Orthogonal series density estimation and the kernel eigenvalue problem},
	url = {http://portal.acm.org/citation.cfm?id=638929.638938},
	volume = {14},
	year = {2002}
}


@incollection{citeulike:311688,
	address = {San Francisco, CA, USA},
	author = {Cox, R. T.},
	booktitle = {Readings in Uncertain Reasoning},
	citeulike-article-id = {311688},
	editor = {Shafer, Glenn and Pearl, Judea},
	isbn = {1558601252},
	keywords = {bayesian, philosophy},
	pages = {353--365},
	priority = {2},
	publisher = {Morgan Kaufmann Publishers Inc.},
	title = {Probability, frequency and reasonable expectation},
	url = {http://portal.acm.org/citation.cfm?id=84628.85340},
	year = {1990}
}


@article{citeulike:311683,
	author = {Escobar, Michael  D.  and West, Mike  },
	title = {Bayesian Density Estimation and Inference Using Mixtures},
	journal = {Journal of the American Statistical Association},
	volume = {90},
	number = {430},
	pages = {577--588},
	year = {1995},
	url = {http://links.jstor.org/sici?sici=0162-1459\%28199506\%2990\%3A430\%3C577\%3ABDEAIU\%3E2.0.CO\%3B2-8},
	keywords = {nonparametric},
	citeulike-article-id = {311683},
	priority = {2},
	abstract = {We describe and illustrate Bayesian inference in models for density estimation using mixtures of Dirichlet processes. These models provide natural settings for density estimation and are exemplified by special cases where data are modeled as a sample from mixtures of normal distributions. Efficient simulation methods are used to approximate various prior, posterior, and predictive distributions. This allows for direct inference on a variety of practical issues, including problems of local versus global smoothing, uncertainty about density estimates, assessment of modality, and the inference on the numbers of components. Also, convergence results are established for a general class of normal mixture models.}
}


@article{citeulike:311335,
	abstract = {We consider the problem of estimating a density function from a sequence of identically distributed observations $x_i$ taking value in $X \subset \mathbb{R}^d$. The estimation procedure constructs a convex mixture of ``basis'' densities and estimates the parameters using the maximum likelihood method. Viewing the error as a combination of two terms, the approximation error measuring the adequacy of the model, and the estimation error resulting from the finiteness of the sample size, we derive upper bounds to the expected total error, thus obtaining bounds for the rate of convergence. These results then allow us to derive explicit expressions relating the sample complexity and model complexity. Copyright (c) 1996 Elsevier Science Ltd.},
	author = {Zeevi, Assaf J. and Meir, Ronny},
	citeulike-article-id = {311335},
	comment = {Gives expression for best number of parameters (mixture components) in terms of sample size (and a constant depending on true distribution/basis used)},
	doi = {10.1016/S0893-6080(96)00037-8},
	journal = {Neural Networks},
	keywords = {density},
	month = jan,
	number = {1},
	pages = {99--109},
	priority = {2},
	title = {Density Estimation Through Convex Combinations of Densities: Approximation and Estimation Bounds},
	url = {http://dx.doi.org/10.1016/S0893-6080(96)00037-8},
	volume = {10},
	year = {1997}
}


@article{citeulike:157632,
	author = {Barron, Andrew  R.  and Sheu, Chyong-Hwa  },
	title = {Approximation of Density Functions by Sequences of Exponential Families},
	journal = {The Annals of Statistics},
	volume = {19},
	number = {3},
	pages = {1347--1369},
	year = {1991},
	url = {http://links.jstor.org/sici?sici=0090-5364\%28199109\%2919\%3A3\%3C1347\%3AAODFBS\%3E2.0.CO\%3B2-T},
	keywords = {density, exponential-families, nonparametric},
	citeulike-article-id = {157632},
	priority = {2},
	abstract = {Probability density functions are estimated by the method of maximum likelihood in sequences of regular exponential families. This method is also familiar as entropy maximization subject to empirical constraints. The approximating families of log-densities that we consider are polynomials, splines and trigonometric series. Bounds on the relative entropy (Kullback-Leibler distance) between the true density and the estimator are obtained and rates of convergence are established for log-density functions assumed to have square integrable derivatives.}
}


% Barron and Cover (1991): index of resolvability for minimum complexity density estimators.
% The journal name was stored in index-inverted form; it is now the real journal title.
@article{citeulike:311302,
	abstract = {The authors introduce an index of resolvability that is proved to bound the rate of convergence of minimum complexity density estimators as well as the information-theoretic redundancy of the corresponding total description length. The results on the index of resolvability demonstrate the statistical effectiveness of the minimum description-length principle as a method of inference. The minimum complexity estimator converges to true density nearly as fast as an estimator based on prior knowledge of the true subclass of densities. Interpretations and basic properties of minimum complexity estimators are discussed. Some regression and classification problems that can be examined from the minimum description-length framework are considered.},
	author = {Barron, A. R. and Cover, T. M.},
	citeulike-article-id = {311302},
	journal = {{IEEE} Transactions on Information Theory},
	keywords = {density, information-theory},
	number = {4},
	pages = {1034--1054},
	priority = {2},
	title = {Minimum complexity density estimation},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=86996},
	volume = {37},
	year = {1991}
}


% Vlassis and Likas (2002): greedy EM for Gaussian mixtures.
% Month uses the standard macro; the acronym and proper noun in the title are brace-protected.
@article{citeulike:311300,
	author = {Vlassis, Nikos and Likas, Aristidis},
	citeulike-article-id = {311300},
	doi = {10.1023/A:1013844811137},
	journal = {Neural Processing Letters},
	keywords = {density, nonparametric},
	month = jan,
	number = {1},
	pages = {77--87},
	priority = {2},
	title = {A Greedy {EM} Algorithm for {Gaussian} Mixture Learning},
	url = {http://dx.doi.org/10.1023/A:1013844811137},
	volume = {15},
	year = {2002}
}


% CloNI discretization paper (2003). It is a conference paper, so it is typed as
% inproceedings with the venue in booktitle. OCR damage in the abstract is repaired.
% NOTE(review): the author surname was taken from the PDF file name in url -- confirm.
@inproceedings{citeulike:310633,
	abstract = {It is known that the naive Bayesian classifier typically works well on discrete
data. All continuous attributes then need to be discretized beforehand for such
applications. An inappropriate range of discretization intervals may result in
degradation of performance. In this paper, we review previous work on
continuous feature discretization and conduct an empirical evaluation of an
improved method called Clustering of $\sqrt{N}$-Interval Discretization (CloNI).
CloNI tries to reduce the number of $\sqrt{N}$ intervals in the datasets by iteratively
combining two consecutive intervals together, according to their median distance
until a stopping criteria is met. We also show that even though C4.5 decision
trees can handle continuous features, we can significantly improve its
performance in some domains if those features were discretized in advance. In
our empirical results, using discretized instead of continuous features in C4.5
never significantly degrades its accuracy. Our results indicate that CloNI
reliably performs as well as or better than the Proportional k-interval
Discretization (PKI) on all domains, and gives a competitive classification
performance for both smaller and larger datasets.},
	author = {Ratanamahatana, Chotirat Ann},
	booktitle = {4th International Conference on Data Mining Including Building Application for {CRM} \& Competitive Intelligence},
	citeulike-article-id = {310633},
	keywords = {discretization},
	month = dec,
	priority = {2},
	title = {{CloNI}: Clustering of {$\sqrt{N}$}-Interval Discretization},
	url = {http://www.cs.orst.edu/~bulatov/papers/ratanamahatana-cloni.pdf},
	year = {2003}
}


% Gilad-Bachrach, Navot and Tishby (2004): a paper in an LNCS proceedings volume, not a book.
% The series name moves from journal to series.
% NOTE(review): booktitle and publisher were supplied from the LNCS 3120 volume record -- confirm.
@inproceedings{citeulike:309524,
	author = {Gilad-Bachrach, Ran and Navot, Amir and Tishby, Naftali},
	booktitle = {Learning Theory: 17th Annual Conference on Learning Theory ({COLT} 2004)},
	citeulike-article-id = {309524},
	keywords = {bayesian, statistical-learning-theory},
	month = jan,
	pages = {549--563},
	priority = {2},
	publisher = {Springer},
	series = {Lecture Notes in Computer Science},
	title = {{Bayes} and {Tukey} Meet at the Center Point},
	url = {http://www.metapress.com/link.asp?id=HJY7H2M8G3HTTULY},
	volume = {3120},
	year = {2004}
}


% Izenman (1991): review of nonparametric density estimation methods.
@article{citeulike:306414,
	author = {Izenman, Alan  J. },
	title = {Recent Developments in Nonparametric Density Estimation},
	journal = {Journal of the American Statistical Association},
	volume = {86},
	number = {413},
	pages = {205--224},
	year = {1991},
	url = {http://links.jstor.org/sici?sici=0162-1459\%28199103\%2986\%3A413\%3C205\%3ARDINDE\%3E2.0.CO\%3B2-C},
	keywords = {nonparametric},
	abstract = {Advances in computation and the fast and cheap computational facilities now available to statisticians have had a significant impact upon statistical research, and especially the development of nonparametric data analysis procedures. In particular, theoretical and applied research on nonparametric density estimation has had a noticeable influence on related topics, such as nonparametric regression, nonparametric discrimination, and nonparametric pattern recognition. This article reviews recent developments in nonparametric density estimation and includes topics that have been omitted from review articles and books on the subject. The early density estimation methods, such as the histogram, kernel estimators, and orthogonal series estimators are still very popular, and recent research on them is described. Different types of restricted maximum likelihood density estimators, including order-restricted estimators, maximum penalized likelihood estimators, and sieve estimators, are discussed, where restrictions are imposed upon the class of densities or on the form of the likelihood function. Nonparametric density estimators that are data-adaptive and lead to locally smoothed estimators are also discussed; these include variable partition histograms, estimators based on statistically equivalent blocks, nearest-neighbor estimators, variable kernel estimators, and adaptive kernel estimators. For the multivariate case, extensions of methods of univariate density estimation are usually straightforward but can be computationally expensive. A method of multivariate density estimation that did not spring from a univariate generalization is described, namely, projection pursuit density estimation, in which both dimensionality reduction and density estimation can be pursued at the same time. 
Finally, some areas of related research are mentioned, such as nonparametric estimation of functionals of a density, robust parametric estimation, semiparametric models, and density estimation for censored and incomplete data, directional and spherical data, and density estimation for dependent sequences of observations.},
	citeulike-article-id = {306414},
	priority = {2}
}


% Borwein, Borwein and Marechal (2000), American Mathematical Monthly.
% NOTE(review): the accent on the third author's surname was restored from the published byline -- confirm.
@article{citeulike:304144,
	author = {Borwein, D. and Borwein, J. M. and Mar{\'e}chal, P.},
	citeulike-article-id = {304144},
	journal = {The American Mathematical Monthly},
	keywords = {maxent},
	number = {6},
	pages = {517--527},
	priority = {2},
	title = {Surprise Maximization},
	url = {http://links.jstor.org/sici?sici=0002-9890\%28200006\%2F07\%29107\%3A6\%3C517\%3ASM\%3E2.0.CO\%3B2-\%23},
	volume = {107},
	year = {2000}
}


% Parzen (2001): discussion comment on "Statistical Modeling: The Two Cultures".
@article{citeulike:304109,
	author = {Parzen, Emanuel  },
	title = {[Statistical Modeling: The Two Cultures]: Comment},
	journal = {Statistical Science},
	volume = {16},
	number = {3},
	pages = {224--226},
	year = {2001},
	url = {http://links.jstor.org/sici?sici=0883-4237\%28200108\%2916\%3A3\%3C224\%3A\%5BMTTCC\%3E2.0.CO\%3B2-9},
	keywords = {philosophy},
	citeulike-article-id = {304109},
	priority = {2}
}


% Bogdanov (2002): arXiv preprint on the root density estimator.
% Month uses the standard macro and the eprint is tagged as arXiv.
% NOTE(review): author initials were completed from the arXiv listing -- confirm.
@misc{citeulike:300478,
	abstract = {A fundamental problem of statistical data analysis, distribution density estimation by experimental data, is considered. A new method with optimal asymptotic behavior, the root density estimator, is developed. The method proposed may be applied to its full extent to solve the statistical inverse problem of quantum mechanics, namely, estimating the psi function on the basis of the results of mutually complementing experiments.},
	archiveprefix = {arXiv},
	author = {Bogdanov, Yu. I.},
	citeulike-article-id = {300478},
	eprint = {physics/0211109},
	keywords = {nonparametric, physics},
	month = nov,
	priority = {2},
	title = {Statistical Inverse Problem},
	url = {http://arxiv.org/abs/physics/0211109},
	year = {2002}
}


% Silverman (1986): monograph on density estimation.
% The HTML entity in the URL is replaced by a plain escaped ampersand, month uses the
% standard macro, and sentences that were glued together in the abstract are separated.
@book{citeulike:300228,
	abstract = {{Although there has been a surge of interest in density estimation in recent years, much of the published research has been concerned with purely technical matters with insufficient emphasis given to the technique's practical value. Furthermore, the subject has been rather inaccessible to the general statistician. The account presented in this book places emphasis on topics of methodological importance, in the hope that this will facilitate broader practical application of density estimation and also encourage research into relevant theoretical work. The book also provides an introduction to the subject for those with general interests in statistics. The important role of density estimation as a graphical technique is reflected by the inclusion of more than 50 graphs and figures throughout the text. Several contexts in which density estimation can be used are discussed, including the exploration and presentation of data, nonparametric discriminant analysis, cluster analysis, simulation and the bootstrap, bump hunting, projection pursuit, and the estimation of hazard rates and other quantities that depend on the density. This book includes a general survey of methods available for density estimation. The Kernel method, both for univariate and multivariate data, is discussed in detail, with particular emphasis on ways of deciding how much to smooth and on computation aspects. Attention is also given to adaptive methods, which smooth to a greater degree in the tails of the distribution, and to methods based on the idea of penalized likelihood.}},
	author = {Silverman, B. W.},
	citeulike-article-id = {300228},
	howpublished = {Hardcover},
	isbn = {0412246201},
	keywords = {nonparametric},
	month = apr,
	priority = {2},
	publisher = {{Chapman \& Hall/CRC}},
	title = {Density Estimation for Statistics and Data Analysis},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0412246201},
	year = {1986}
}


% Efron and Tibshirani (1996): specially designed exponential families for density estimation.
% The stray trailing comma is removed from the title; the garbled math and non-ASCII
% quotes in the abstract are rewritten in LaTeX.
@article{citeulike:300227,
	abstract = {We wish to estimate the probability density $g(y)$ that produced an observed random sample of vectors $y_1, y_2, \ldots, y_n$. Estimates of $g(y)$ are traditionally constructed in two quite different ways: by maximum likelihood fitting within some parametric family such as the normal or by nonparametric methods such as kernel density estimation. These two methods can be combined by putting an exponential family ``through'' a kernel estimator. These are the specially designed exponential families mentioned in the title. Poisson regression methods play a major role in calculations concerning such families.},
	author = {Efron, Bradley and Tibshirani, Robert},
	citeulike-article-id = {300227},
	journal = {The Annals of Statistics},
	keywords = {nonparametric},
	number = {6},
	pages = {2431--2461},
	priority = {2},
	title = {Using specially designed exponential families for density estimation},
	url = {http://projecteuclid.org/Dienst/UI/1.0/Summarize/euclid.aos/1032181161},
	volume = {24},
	year = {1996}
}


% Bennett and Campbell (2000): SVM tutorial in SIGKDD Explorations.
% It is a journal article, so it is typed as article. The first author's surname and
% initials had been swapped and fused. Volume and issue are read from the "vol2.2" path
% segment of the URL, and the HTML entity in the URL is replaced by an escaped ampersand.
@article{citeulike:300226,
	abstract = {Support Vector Machines (SVMs) and related kernel methods have become increasingly popular tools for data mining tasks such as classification, regression, and novelty detection. The goal of this tutorial is to provide an intuitive explanation of SVMs from a geometric perspective. The classification problem is used to investigate the basic concepts behind SVMs and to examine their strengths and weaknesses from a data mining perspective. While this overview is not comprehensive, it does provide resources for those interested in further exploring SVMs.},
	author = {Bennett, K. P. and Campbell, C.},
	citeulike-article-id = {300226},
	comment = {Interesting view of SVM optimization as separating modified convex hulls.},
	journal = {{SIGKDD} Explorations},
	keywords = {svm},
	number = {2},
	priority = {2},
	title = {Support Vector Machines: Hype or Hallelujah?},
	url = {http://scholar.google.com/url?sa=U\&q=http://www.mm.di.uoa.gr/~rouvas/ssi/sigkdd/sigkdd.vol2.2/bennett.ps},
	volume = {2},
	year = {2000}
}


% Marron and Wand (1992): exact MISE for kernel estimators of normal mixture densities.
% Month uses the standard macro.
@article{citeulike:300170,
	abstract = {An exact and easily computable expression for the mean integrated squared error (MISE) for the kernel estimator of a general normal mixture density, is given for Gaussian kernels of arbitrary order. This provides a powerful new way of understanding density estimation which complements the usual tools of simulation and asymptotic analysis. The family of normal mixture densities is very flexible and the formulae derived allow simple exact analysis for a wide variety of density shapes. A number of applications of this method giving important new insights into kernel density estimation are presented. Among these is the discovery that the usual asymptotic approximations to the MISE can be quite inaccurate, especially when the underlying density contains substantial fine structure and also strong evidence that the practical importance of higher order kernels is surprisingly small for moderate sample sizes.},
	author = {Marron, J. S. and Wand, M. P.},
	citeulike-article-id = {300170},
	comment = {Gives 15 sample densities to test density fitting algorithms},
	journal = {The Annals of Statistics},
	keywords = {density, nonparametric},
	month = jun,
	number = {2},
	pages = {712--736},
	priority = {2},
	title = {Exact Mean Integrated Squared Error},
	url = {http://yaroslav.hopto.org/papers/marron-exact-mean.pdf},
	volume = {20},
	year = {1992}
}


% Hjort and Glad (1995): semiparametric density estimation with a parametric start.
% The junk editor field is dropped and the journal name matches the other Annals entries.
% NOTE(review): the stored volume 24, number 4, pages 1619--1647 contradicted year 1995
% (volume 24 of this journal is 1996 elsewhere in this file); volume, number and pages
% were replaced from the published record -- confirm.
@article{citeulike:300166,
	abstract = {The traditional kernel density estimator of an unknown density is by construction completely nonparametric in the sense that it has no preferences and will work reasonably well for all shapes. The present paper develops a class of semiparametric methods that are designed to work better than the kernel estimator in a broad nonparametric neighbourhood of a given parametric class of densities, for example, the normal, while not losing much in precision when the true density is far from the parametric class. The idea is to multiply an initial parametric density estimate with a kernel-type estimate of the necessary correction factor. This works well in cases where the correction factor function is less rough than the original density itself. Extensive comparisons with the kernel estimator are carried out, including exact analysis for the class of all normal mixtures. The new method, with a normal start, wins quite often, even in many cases where the true density is far from normal. Procedures for choosing the smoothing parameter of the estimator are also discussed. The new estimator should be particularly useful in higher dimensions, where the usual nonparametric methods have problems. The idea is also spelled out for nonparametric regression.},
	author = {Hjort, N. L. and Glad, I. K.},
	citeulike-article-id = {300166},
	journal = {The Annals of Statistics},
	keywords = {density, nonparametric},
	number = {3},
	pages = {882--904},
	priority = {2},
	title = {Nonparametric density estimation with a parametric start},
	url = {http://yaroslav.hopto.org/papers/hjort-nonparametric-with-parametric.pdf},
	volume = {23},
	year = {1995}
}


% Paskin: maximum entropy probabilistic logic.
% The record has no journal or year, so it is typed as misc rather than article.
% Lost "fi" ligatures and dashes in the abstract are restored.
% NOTE(review): the report number in howpublished is read from the PDF file name in url;
% the issuing institution and year are not visible here -- confirm and add.
@misc{citeulike:297805,
	abstract = {Recent research has shown there are two types of uncertainty
that can be expressed in first-order logic---
propositional and statistical uncertainty---and that both
types can be represented in terms of probability spaces.
However, these efforts have fallen short of providing a
general account of how to design probability measures
for these spaces; as a result, we lack a crucial component
of any system that reasons under these types of uncertainty.
In this paper, we describe an automatic procedure
for defining such measures in terms of a probabilistic
knowledge base. In particular, we employ the
principle of maximum entropy to select measures that
are consistent with our knowledge and that make the
fewest assumptions in doing so. This approach yields
models of first-order uncertainty that are principled, intuitive,
and economical in their representation.},
	author = {Paskin},
	citeulike-article-id = {297805},
	howpublished = {Technical report CSD-01-1161},
	keywords = {maxent},
	priority = {2},
	title = {Maximum entropy probabilistic logic},
	url = {http://www.stanford.edu/~paskin/pubs/csd-01-1161.pdf}
}


% Fayyad and Irani (1993): MDL-based multi-interval discretization.
% An inproceedings entry needs booktitle, so the venue moves out of journal.
% NOTE(review): author given names and the conference name (IJCAI-93, not an
% "Uncertainty in AI" conference) were supplied from the published record -- confirm.
@inproceedings{citeulike:280566,
	abstract = {Since most real-world applications of classification learning involve continuous-valued attributes, properly addressing the discretization process is an important problem. This paper addresses the use of the entropy minimization heuristic for discretizing the range of a continuous-valued attribute into multiple intervals. We briefly present theoretical evidence for the appropriateness of this heuristic for use in the binary discretization algorithm used in ID3, C4, CART, and other learning algorithms. The results serve to justify extending the algorithm to derive multiple intervals. We formally derive a criterion based on the minimum description length principle for deciding the partitioning of intervals. We demonstrate via empirical evaluation on several real-world data sets that better decision trees are obtained using the new multi-interval algorithm.},
	author = {Fayyad, Usama M. and Irani, Keki B.},
	booktitle = {Proceedings of the 13th International Joint Conference on Artificial Intelligence ({IJCAI}-93)},
	citeulike-article-id = {280566},
	keywords = {discretization},
	pages = {1022--1027},
	priority = {2},
	title = {Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning},
	url = {http://www.cs.orst.edu/~bulatov/papers/fayyad-discretization.pdf},
	year = {1993}
}


% Geman, Bienenstock and Doursat (1992): the bias/variance dilemma.
% The HTML entity in the third author's name becomes a LaTeX accent, the journal name is
% spelled out, and month uses the standard macro.
@article{citeulike:266181,
	address = {Cambridge, MA, USA},
	author = {Geman, Stuart and Bienenstock, Elie and Doursat, Ren{\'e}},
	citeulike-article-id = {266181},
	issn = {0899-7667},
	journal = {Neural Computation},
	month = jan,
	number = {1},
	pages = {1--58},
	priority = {2},
	publisher = {MIT Press},
	title = {Neural networks and the bias/variance dilemma},
	url = {http://portal.acm.org/citation.cfm?id=148062},
	volume = {4},
	year = {1992}
}


% Wellman (1990): qualitative probabilistic networks.
% The journal name is spelled out and month uses the standard macro.
@article{citeulike:248104,
	address = {Essex, UK},
	author = {Wellman, M. P.},
	citeulike-article-id = {248104},
	doi = {10.1016/0004-3702(90)90026-V},
	issn = {0004-3702},
	journal = {Artificial Intelligence},
	keywords = {ki},
	month = aug,
	number = {3},
	pages = {257--303},
	priority = {2},
	publisher = {Elsevier Science Publishers Ltd.},
	title = {Fundamental concepts of qualitative probabilistic networks},
	url = {http://portal.acm.org/citation.cfm?id=101744},
	volume = {44},
	year = {1990}
}


% Garcia, Stillman and Sturmfels (2003): arXiv preprint on algebraic geometry of Bayesian networks.
% Month uses the standard macro, the eprint is tagged as arXiv, and the proper noun in
% the title is brace-protected.
@misc{citeulike:213272,
	abstract = {We study the algebraic varieties defined by the conditional independence
statements of Bayesian Networks. A complete algebraic classification is given
for Bayesian Networks on at most five random variables. Hidden variables are
related to the geometry of higher secant varieties.},
	archiveprefix = {arXiv},
	author = {Garcia, Luis D. and Stillman, Michael and Sturmfels, Bernd},
	citeulike-article-id = {213272},
	eprint = {math/0301255},
	keywords = {geometry},
	month = jan,
	priority = {2},
	title = {Algebraic Geometry of {Bayesian} Networks},
	url = {http://arxiv.org/abs/math/0301255},
	year = {2003}
}


% Spiegelhalter, Best, Carlin and van der Linde (2002): the deviance information criterion paper.
% HTML entities in the abstract become LaTeX quotes and the leading tabs are removed.
% NOTE(review): author names (first author's initials, the "van der" particle) were
% corrected from the published byline -- confirm.
@article{citeulike:205474,
	abstract = {We consider the problem of comparing complex hierarchical models in which the number of parameters is not clearly defined. Using an information theoretic argument we derive a measure $p_D$ for the effective number of parameters in a model as the difference between the posterior mean of the deviance and the deviance at the posterior means of the parameters of interest. In general $p_D$ approximately corresponds to the trace of the product of Fisher's information and the posterior covariance, which in normal models is the trace of the `hat' matrix projecting observations onto fitted values. Its properties in exponential families are explored. The posterior mean deviance is suggested as a Bayesian measure of fit or adequacy, and the contributions of individual observations to the fit and complexity can give rise to a diagnostic plot of deviance residuals against leverages. Adding $p_D$ to the posterior mean deviance gives a deviance information criterion for comparing models, which is related to other information criteria and has an approximate decision theoretic justification. The procedure is illustrated in some examples, and comparisons are drawn with alternative Bayesian and classical proposals. Throughout it is emphasized that the quantities required are trivial to compute in a Markov chain Monte Carlo analysis.},
	author = {Spiegelhalter, David J. and Best, Nicola G. and Carlin, Bradley P. and van der Linde, Angelika},
	citeulike-article-id = {205474},
	issn = {1369-7412},
	journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
	keywords = {bayesian, model-selection},
	number = {4},
	pages = {583--639},
	priority = {2},
	title = {{Bayesian} measures of model complexity and fit},
	url = {http://www.ingentaconnect.com/content/bpl/rssb/2002/00000064/00000004/art00353},
	volume = {64},
	year = {2002}
}


% Csiszar and Shields (2004): information theory and statistics tutorial.
% The first author was listed twice (once with a garbled accent) and is now listed once.
% The truncated series name moves out of the title into series, the HTML entity in the
% URL is replaced by an escaped ampersand, and month uses the standard macro.
@book{citeulike:144429,
	author = {Csisz{\'a}r, Imre and Shields, Paul},
	citeulike-article-id = {144429},
	comment = {explains I and f-divergence, iterative scaling, Sanov's theorem},
	howpublished = {Paperback},
	isbn = {1933019050},
	keywords = {book, information-theory},
	month = dec,
	priority = {2},
	publisher = {{Now Publishers Inc}},
	series = {Foundations and Trends in Communications and Information Theory},
	title = {Information Theory and Statistics: A Tutorial},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/1933019050},
	year = {2004}
}


% Kearns et al. (1994): learnability of discrete distributions, STOC '94.
@inproceedings{citeulike:149184,
	author = {Kearns, Michael   and Mansour, Yishay   and Ron, Dana   and Rubinfeld, Ronitt   and Schapire, Robert  E.  and Sellie, Linda  },
	title = {On the learnability of discrete distributions},
	booktitle = {STOC '94: Proceedings of the twenty-sixth annual ACM symposium on Theory of computing},
	pages = {273--282},
	year = {1994},
	publisher = {ACM Press},
	address = {New York, NY, USA},
	isbn = {0897916638},
	doi = {10.1145/195058.195155},
	url = {http://portal.acm.org/citation.cfm?id=195155},
	keywords = {colt},
	citeulike-article-id = {149184},
	priority = {2}
}


% Kass and Wasserman (1996): review of priors chosen by formal rules.
% Abstract typos are repaired; the comment field is left exactly as exported.
% NOTE(review): author given names, volume, number and pages were supplied from the
% published record -- confirm.
@article{citeulike:180994,
	abstract = {Subjectivism has become the dominant philosophical foundation for Bayesian inference. Yet in practice, most Bayesian analyses are performed with so-called "noninformative" priors, that is, priors constructed by some formal rule. We review the plethora of techniques for constructing such priors and discuss some of the practical and philosophical issues that arise when they are used. We give special emphasis to Jeffreys's rules and discuss the evolution of his viewpoint about the interpretation of priors, away from unique representation of ignorance toward the notion that they should be chosen by convention. We conclude that the problems raised by the research on priors chosen by formal rules are serious and may not be dismissed lightly: When sample sizes are small (relative to the number of parameters being estimated), it is dangerous to put faith in any "default" solution; but when asymptotics take over, Jeffreys's rules and their variants remain reasonable choices. We also provide an annotated bibliography.},
	author = {Kass, Robert E. and Wasserman, Larry},
	citeulike-article-id = {180994},
	comment = {hehe Yeah, I'm copying all your references;) drosen@stat.berkeley.edu

---=note-separator=---
Hey davidr, your bibliography looks interesting, what sort of research do you do? (yaroslavvb@gmail.com)},
	journal = {Journal of the American Statistical Association},
	keywords = {prior},
	number = {435},
	pages = {1343--1370},
	priority = {3},
	title = {The selection of prior distributions by formal rules},
	url = {http://yaroslav.hopto.org/papers/kass-priors.pdf},
	volume = {91},
	year = {1996}
}


% Cull, Flahive and Robson (2005): difference equations textbook.
% The series name moves out of the title into series, the space before the colon is
% removed, the HTML entity in the URL is replaced by an escaped ampersand, and month uses
% the standard macro.
@book{citeulike:181128,
	author = {Cull, Paul and Flahive, Mary and Robson, Robby},
	citeulike-article-id = {181128},
	howpublished = {Hardcover},
	isbn = {0387232338},
	keywords = {book},
	month = apr,
	priority = {2},
	publisher = {Springer},
	series = {Undergraduate Texts in Mathematics},
	title = {Difference Equations: From Rabbits to Chaos},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0387232338},
	year = {2005}
}


% Kocka and Zhang: effective dimensions of partially observed polytrees.
% The HTML entity in the abstract is replaced by an escaped ampersand.
% The abstract is truncated in the source record and is left truncated here.
@misc{citeulike:180980,
	abstract = {Model complexity is an important factor to consider when selecting among graphical models. When all variables are observed, the complexity of a model can be measured by its standard dimension, i.e. the number of independent parameters. When latent variables are present, however, the standard dimension might no longer be appropriate. Instead, an effective dimension should be used [5]. Zhang \& Kocka [13] showed how to compute the effective dimensions of partially observed trees. In this paper we...},
	author = {Kocka, Tomas and Zhang, Nevin},
	citeulike-article-id = {180980},
	keywords = {regularization},
	priority = {2},
	title = {Effective Dimensions of Partially Observed Polytrees},
	url = {http://citeseer.ist.psu.edu/572937.html}
}


% Marsaglia (1965): ratios of normal variables.
% NOTE(review): the author's given name, volume, number and pages were supplied from the
% published record -- confirm.
@article{citeulike:180967,
	author = {Marsaglia, George},
	citeulike-article-id = {180967},
	journal = {Journal of the American Statistical Association},
	keywords = {estimation},
	number = {309},
	pages = {193--204},
	priority = {2},
	title = {Ratios of normal variables and ratios of sums of uniform variables},
	url = {http://yaroslav.hopto.org/papers/marsaglia-ratios-of-normals.pdf},
	volume = {60},
	year = {1965}
}


% Dawid (1983): encyclopedia article on invariant prior distributions.
% An incollection entry needs booktitle and publisher, so the encyclopedia title moves
% out of journal. The comma that had wrapped onto its own line in the abstract is rejoined.
% NOTE(review): author given names, editors and publisher were supplied from the
% published record -- confirm.
@incollection{citeulike:180965,
	abstract = {The controversy surrounding Bayesian inference,
and its acceptability as a scientific
methodology of statistical inference, has centered
on its requirement that prior information
about statistical parameters be explicitly
introduced and described in terms of a probability
distribution. (See INFERENCE, STATISTICAL
for further background on the Bayesian
approach.) A common objection is that
the seeming arbitrariness and subjectivity of
the prior distribution is at variance with the
desire that statistical inference be entirely
objective.
The logical Bayesian view holds that a
prior distribution represents partial logical information about unknown parameters, of
the same objective status as a statistical
model. In particular, it is supposed that, for
any model, there is a specific prior distribution
representing "complete ignorance." The
program of determining such ignorance
priors has been presented most cogently by
Jeffreys},
	author = {Dawid, A. Philip},
	booktitle = {Encyclopedia of Statistical Sciences},
	citeulike-article-id = {180965},
	editor = {Kotz, Samuel and Johnson, Norman L.},
	keywords = {bayesian, prior},
	priority = {3},
	publisher = {John Wiley \& Sons},
	title = {Invariant Prior Distributions},
	url = {http://yaroslav.hopto.org/papers/dawid-invariant-priors.pdf},
	year = {1983}
}


% Adler (1981): geometry of random fields.
% The series name moves out of the title into series, the HTML entity in the URL is
% replaced by an escaped ampersand, and month uses the standard macro.
% NOTE(review): the series name was expanded from the abbreviated
% "Probability & Mathematical Statistics S." -- confirm the exact wording.
@book{citeulike:150582,
	author = {Adler, Robert J.},
	citeulike-article-id = {150582},
	howpublished = {Hardcover},
	isbn = {0471278440},
	keywords = {crf, geometry},
	month = jun,
	priority = {2},
	publisher = {{John Wiley \& Sons Inc}},
	series = {Wiley Series in Probability and Mathematical Statistics},
	title = {The Geometry of Random Fields},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0471278440},
	year = {1981}
}


% Altun, Smola and Hofmann (2004): exponential families for CRFs, UAI-04.
% Each author had given name and surname swapped (the key and the URL file name both
% start with the first author's surname, Altun); names are now "Last, First".
@inproceedings{Altun04,
	address = {Arlington, Virginia},
	author = {Altun, Yasemin and Smola, Alex and Hofmann, Thomas},
	booktitle = {Proceedings of the 20th Annual Conference on Uncertainty in Artificial Intelligence (UAI-04)},
	citeulike-article-id = {150250},
	keywords = {crf, svm},
	pages = {2--9},
	priority = {2},
	publisher = {AUAI Press},
	title = {Exponential Families for Conditional Random Fields},
	url = {http://www.cs.brown.edu/people/altun/pubs/AltSmoHof-UAI04.ps},
	year = {2004}
}


% Rennie: note on ordinal logistic regression, hosted on the author's home page.
% The record has no journal or year, so it is typed as misc rather than article.
% NOTE(review): the author's given name and initials were supplied to match the
% "jrennie" account in url -- confirm; the year is unknown.
@misc{citeulike:150249,
	author = {Rennie, Jason D. M.},
	citeulike-article-id = {150249},
	keywords = {svm},
	priority = {2},
	title = {Ordinal Logistic Regression},
	url = {http://people.csail.mit.edu/u/j/jrennie/public\_html/writing/olr.pdf}
}


% Markov bases of binary graph models: arXiv preprint math/0308280.
% The record has no journal, so it is typed as misc with arXiv eprint fields. The eprint
% id and year are read from the original PS_cache URL (math/pdf/0308/0308280), and the
% URL now points at the abs page like the other arXiv entries in this file.
% NOTE(review): the authors were missing and were supplied from the arXiv listing -- confirm.
@misc{citeulike:134224,
	abstract = {Finding a set of moves that don't affect the marginals of the contingency table},
	archiveprefix = {arXiv},
	author = {Develin, Mike and Sullivant, Seth},
	citeulike-article-id = {134224},
	eprint = {math/0308280},
	keywords = {bayesnet},
	priority = {2},
	title = {{Markov} Bases of Binary Graph Models},
	url = {http://arxiv.org/abs/math/0308280},
	year = {2003}
}


% Jebara (2003): monograph unifying discriminative and generative learning.
% The HTML paragraph tag is removed from the abstract, the series name moves out of the
% title into series, the HTML entity in the URL is replaced by an escaped ampersand, and
% month uses the standard macro.
@book{citeulike:134203,
	abstract = {{Machine Learning is a powerful new field with many important practical applications. It has recently matured from a black art into a principled science with a strong mathematical and statistical foundation. Thanks to the information age and flood of data, it has also taken many domains by storm including biology, text processing, internet data organization, computer vision, speech recognition, computer-human interfaces, robotics and artificial intelligence. Engineers and companies are looking to these technologies to gain a competitive edge. From the smallest startups that are using support vector machines for web page classification, to biotech firms that are doing drug discovery and large corporations that are building learning into database systems, the tools of this field are proliferating. Machine Learning: Discriminative and Generative covers the main contemporary themes and tools in machine learning ranging from Bayesian probabilistic models to discriminative support-vector machines. However, unlike previous books that only discuss these rather different approaches in isolation, it bridges the two schools of thought together within a common framework, elegantly connecting their various theories and making one common big-picture. Also, this bridge brings forth new hybrid discriminative-generative tools that combine the strengths of both camps. This book serves multiple purposes as well. The framework acts as a scientific breakthrough, fusing the areas of generative and discriminative learning and will be of interest to many researchers. However, as a conceptual breakthrough, this common framework unifies many previously unrelated tools and techniques and makes them understandable to a larger portion of the public. This gives the more practical-minded engineer, student and the industrial public an easy-access and more sensible road map into the world of machine learning.
Machine Learning: Discriminative and Generative is designed for an audience composed of researchers \& practitioners in industry and academia. The book is also suitable as a secondary text for graduate-level students in computer science and engineering.}},
	author = {Jebara, Tony},
	citeulike-article-id = {134203},
	comment = {- maximum entropy discriminative as unification of discriminative and generative approaches},
	howpublished = {Hardcover},
	isbn = {1402076479},
	keywords = {book, generative-discriminative, svm},
	month = dec,
	priority = {2},
	publisher = {Springer},
	series = {The Kluwer International Series in Engineering and Computer Science},
	title = {Machine Learning: Discriminative and Generative},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/1402076479},
	year = {2003}
}


% Rusakov (2002): tutorial on Schwarz's 1978 paper.
% An unpublished entry needs a note field, so one is added. The proper noun in the title
% is brace-protected.
@unpublished{citeulike:134020,
	author = {Rusakov, Dmitry},
	citeulike-article-id = {134020},
	keywords = {bayesian, model-selection, regularization},
	note = {Unpublished tutorial manuscript},
	priority = {2},
	title = {A Tutorial on {Schwarz'} 1978 paper},
	url = {http://www.cs.technion.ac.il/~rusakov/archive/texts/schwarz\_tutorial.ps},
	year = {2002}
}


% Heckerman (1999): tutorial chapter on learning Bayesian networks.
% The record carries isbn, publisher and pages but no journal: it is a chapter in an
% edited book, so it is typed as incollection.
% NOTE(review): booktitle, editor and address were supplied from the record for this
% ISBN -- confirm.
@incollection{citeulike:127193,
	address = {Cambridge, MA, USA},
	author = {Heckerman, David},
	booktitle = {Learning in Graphical Models},
	citeulike-article-id = {127193},
	editor = {Jordan, Michael I.},
	isbn = {0262600323},
	keywords = {bayesian, graphical},
	pages = {301--354},
	priority = {2},
	publisher = {MIT Press},
	title = {A tutorial on learning with {Bayesian} networks},
	url = {http://portal.acm.org/citation.cfm?id=308574.308676},
	year = {1999}
}


@article{citeulike:126999,
	abstract = {The use of the principle of minimum information, or equivalently the
principle of maximum entropy, has been advocated by a number of authors
over recent years both in statistical physics as well as more generally in
statistical inference. It has perhaps not been sufficiently appreciated by
philosophers, however, that this principle, when properly understood,
affords a rule of inductive inference of the widest generality. The purpose
of this paper is to draw attention to the generality of the principle. Thus
the Bayesian rule of conditionalisation, as well as its extension by R. C.
Jeffrey, will be exhibited as special cases. General conditions under which
it yields a unique prescription will also be studied. Detailed treatment will
be restricted to the finite-dimensional case but an outline of the general
case is given in the Appendix.},
	author = {Williams, P.},
	citeulike-article-id = {126999},
	journal = {The British Journal for the Philosophy of Science},
	keywords = {bayesian, maxent, philosophy},
	pages = {131--144},
	priority = {2},
	title = {Bayesian Conditionalisation and the Principle of Minimum Information},
	url = {http://www.cs.orst.edu/~bulatov/papers/williams-conditionalization.pdf},
	volume = {31},
	year = {1980}
}


@article{citeulike:126994,
	abstract = {A bound on the uncertainty of the Lagrange multiplier due to experimental scatter in the input frequencies is provided.
An inequality relating experimental and inherent uncertainties is derived. The product of the inherent uncertainties of a
constraint and its conjugate variable is shown to have a minimal value of unity.},
	author = {Alhassid and Levine},
	citeulike-article-id = {126994},
	comment = {The experimental determination of the frequencies
of different outcomes is necessarily subjected to some
scatter. In the maximum entropy formalism (1,2) one
attempts to fit the measured frequencies by a theoretical
distribution computed by the maximum entropy
(subject to constraints) procedure. The quality of
the fit can only be improved by including additional
constraints. There may come a point however where
the addition of further constraints serves only to fit
the noise. Given the magnitude of the relative error
in the individual frequencies, it was previously shown
possible to identify that point.},
	journal = {Chemical Physics Letters},
	keywords = {maxent, physics},
	number = {1},
	pages = {16--20},
	priority = {2},
	title = {Experimental and inherent uncertainties in the information theoretic approach},
	volume = {73},
	year = {1980}
}


@article{citeulike:126524,
	author = {Skyrms},
	citeulike-article-id = {126524},
	comment = {- Views MaxEnt as a principle of stochastic "hypothesizing" as opposed to a rule of inducing probabilities.
- Bayesian updating satisfies "dynamic coherence" (Dutch book) requirements
- MaxEnt inference doesn't satisfy dynamic coherence, except for deterministic prior.
- Shows that MaxEnt updates satisfies the conditions needed for Stalnaker selection function for hypothesizing},
	journal = {Theory and Decision},
	keywords = {bayesian, maxent, philosophy},
	number = {3},
	priority = {0},
	title = {Updating, supposing, and maxent},
	url = {http://yaroslav.hopto.org/papers/skyrms-updating.pdf},
	volume = {22},
	year = {1987}
}


@techreport{citeulike:126523,
	abstract = {We provide a classification of graphical models according to their representation as exponential families. Undirected graphical models with no hidden variables are linear exponential families (LEFs), directed acyclic graphical (DAG) models and chain graphs with no hidden variables, including DAG models with several families of local distributions, are curved exponential families (CEFs) and graphical models with hidden variables are stratified exponential families (SEFs). A SEF is a finite union of CEFs of various dimensions satisfying some regularity conditions. The main results of this paper are that graphical models are SEFs and that many graphical models are not CEFs. That is, roughly speaking, graphical models when viewed as exponential families correspond to a set of smooth manifolds of various dimensions and usually not to a single smooth manifold. These results are discussed in the context of model selection.},
	author = {Geiger and Heckerman and King and Meek},
	citeulike-article-id = {126523},
	comment = {Gives definitions of various exponential families.},
	institution = {Microsoft Research},
	keywords = {graphical, information-geometry},
	month = jul,
	number = {MSR-TR-98-31},
	priority = {2},
	title = {Stratified Exponential Families: Graphical Models and Model Selection},
	url = {http://research.microsoft.com/research/pubs/view.aspx?msr\_tr\_id=MSR-TR-98-31},
	year = {1998}
}


@article{citeulike:126514,
	author = {Herbrich, Ralf and Graepel, Thore and Campbell, Colin},
	citeulike-article-id = {126514},
	issn = {1533-7928},
	journal = {Journal of Machine Learning Research},
	keywords = {bayesian, svm},
	pages = {245--279},
	priority = {2},
	publisher = {MIT Press},
	title = {{Bayes} point machines},
	url = {http://portal.acm.org/citation.cfm?id=944733.944742},
	volume = {1},
	year = {2001}
}


@article{citeulike:126492,
	abstract = {A consistent approach to the inference of a probability distribution given a limited number of expectation values of relevant variables is discussed. There are two key assumptions: that the experiment can be independently repeated a finite number (not necessarily large) of times and that the theoretical expectation values of the relevant observables are to be estimated from their measured sample averages. Three independent but complementary routes for deriving the form of the distribution from these two assumptions are reviewed. All three lead to a unique distribution which is identical with the one obtained by the maximum-entropy formalism. The present derivation thus provides an alternative approach to the inference problem which does not invoke Shannon's notion of missing information or entropy. The approach is more limited in scope than the one proposed by Jaynes, but has the advantage that it is objective and that the operational origin of the "given" expectation values is specified.},
	author = {Tikochinsky and Tishby and Levine},
	citeulike-article-id = {126492},
	comment = {Gives 3 alternative justification of MaxEnt principle

- Something about consistency across repeatable experiments
- MaxEnt choice procedure gives smallest variance out of all procedures choosing a distribution consistent with constraints (special case of Rao-Cramer inequality)
- MaxEnt solution comes out if we treat our vector of sample averages as sufficient statistic and apply Koopman-Pitman theorem (derives Koopman-Pitman theorem!)},
	journal = {Physical Review A},
	keywords = {maxent},
	pages = {2638--2644},
	priority = {0},
	title = {Alternative approach to maximum-entropy inference},
	url = {http://yaroslav.hopto.org/papers/tishby-alternative-maxent.pdf},
	volume = {30},
	year = {1984}
}


@article{citeulike:126458,
	abstract = {Some geometric properties of PD's are established, Kullback's I-divergence playing the role of squared Euclidean distance. The minimum discrimination information problem is viewed as that of projecting a PD onto a convex set of PD's and useful existence theorems for and characterizations of the minimizing PD are arrived at. A natural generalization of known iterative algorithms converging to the minimizing PD in special situations is given; even for those special cases, our convergence proof is more generally valid than those previously published. As corollaries of independent interest, generalizations of known results on the existence of PD's or nonnegative matrices of a certain form are obtained. The Lagrange multiplier technique is not used.},
	author = {Csisz{\'a}r, I.},
	citeulike-article-id = {126458},
	comment = {- Gives conditions on the constraint set that guarantee MaxEnt solution},
	journal = {The Annals of Probability},
	keywords = {information-geometry, maxent},
	number = {1},
	pages = {146--158},
	priority = {2},
	title = {{I}-Divergence Geometry of Probability Distributions and Minimization Problems},
	url = {http://links.jstor.org/sici?sici=0091-1798\%28197502\%293\%3A1\%3C146\%3AGOPDAM\%3E2.0.CO\%3B2-R},
	volume = {3},
	year = {1975}
}


@incollection{citeulike:125896,
	abstract = {The principles of Bayesian reasoning are reviewed and applied to problems of inference from data sampled from Poisson, Gaussian and Cauchy distributions. Probability distributions (priors and likelihoods) are assigned in appropriate hypothesis spaces using the Maximum Entropy Principle, and then manipulated via Bayes' Theorem. Bayesian hypothesis testing requires careful consideration of the prior ranges of any parameters involved, and this leads to a quantitative statement of Occam's Razor. As an example of this general principle we offer a solution to an important problem in regression analysis; determining the optimal number of parameters to use when fitting graphical data with a set of basis functions.},
	author = {Gull, S.},
	booktitle = {Maximum Entropy and Bayesian Methods in Science and Engineering, Volume 1: Foundations},
	citeulike-article-id = {125896},
	editor = {Erickson, G. J. and Smith, C. R.},
	keywords = {bayesian, maxent},
	priority = {2},
	publisher = {Kluwer},
	title = {Bayesian inductive inference and maximum entropy},
	year = {1988}
}


@article{citeulike:122857,
	author = {Denzau and Gibbons and Greenberg},
	title = {Bayesian Estimation of Proportions with a Cross-Entropy Prior},
	journal = {Communications in Statistics-Theory and Methods},
	volume = 18,
	number = 5,
	pages = {1843--1861},
	year = 1989,
	abstract = {This paper suggests estimators of the frequencies or proportions 
of N distinguishable objects contained in categories, given various types of information.

We consider information in the form of exact constraints on the N,
sample frequencies, and frequencies of related data. The analysis uses Bayesian
methods, where the prior distribution is assumed to be a function of the cross-entropy between the N. and a reference distribution. We show the relationship
between our estimator and the log-linear and logit models and also present a
sampling experiment to compare our proposed estimator with the iterated proportional fitting estimator.},
	keywords = {maxent},
	citeulike-article-id = {122857},
	priority = {2}
}


@book{citeulike:122422,
	abstract = {{Bayes or Bust? provides the first balanced treatment of the complex set of issues involved in this nagging conundrum in the philosophy of science.}},
	author = {Earman, John},
	citeulike-article-id = {122422},
	howpublished = {Hardcover},
	isbn = {0262050463},
	keywords = {bayesian, philosophy},
	month = may,
	priority = {2},
	publisher = {{The MIT Press}},
	title = {{Bayes} or Bust? A Critical Examination of {Bayesian} Confirmation Theory},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0262050463},
	year = {1992}
}


@book{citeulike:120816,
	abstract = {{Differential geometry provides an aesthetically appealing and often revealing view of statistical inference. Beginning with an elementary treatment of one-parameter statistical models and ending with an overview of recent developments, this is the first book to provide an introduction to the subject that is largely accessible to readers not already familiar with differential geometry. It also gives a streamlined entry into the field to readers with richer mathematical backgrounds. Much space is devoted to curved exponential families, which are of interest not only because they may be studied geometrically but also because they are analytically convenient, so that results may be derived rigorously. In addition, several appendices provide useful mathematical material on basic concepts in differential geometry. Topics covered include the following: Basic properties of curved exponential families; Elements of second-order, asymptotic theory; The Fisher-Efron-Amari theory of information loss and recovery; Jeffreys-Rao information-metric Riemannian geometry; Curvature measures of nonlinearity; Geometrically motivated diagnostics for exponential family regression; Geometrical theory of divergence functions; A classification of and introduction to additional work in the field.}},
	author = {Kass, Robert E. and Vos, Paul W.},
	citeulike-article-id = {120816},
	howpublished = {Hardcover},
	isbn = {0471826685},
	keywords = {information-geometry},
	month = jul,
	priority = {2},
	publisher = {Wiley-Interscience},
	title = {Geometrical Foundations of Asymptotic Inference},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0471826685},
	year = {1997}
}


@book{citeulike:120454,
	author = {Jeffreys, H.},
	citeulike-article-id = {120454},
	comment = {- Sorted algebraic models in order of simplicity (linear first, then quadratic).},
	howpublished = {Hardcover},
	isbn = {0521054257},
	keywords = {bayesian, philosophy},
	month = dec,
	priority = {2},
	publisher = {{Cambridge University Press}},
	title = {Scientific Inference},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0521054257},
	year = {1957}
}


@incollection{citeulike:120256,
	abstract = {In the third chapter of his book Theory and Evidence, Clark Glymour
explains why he is not a Bayesian. I shall attempt to show, on the contrary,
that he is a Bayesian, more so than many who march under that banner.},
	address = {Minneapolis},
	author = {Rosenkrantz, R.},
	booktitle = {Testing Scientific Theories},
	citeulike-article-id = {120256},
	editor = {Earman, J.},
	keywords = {bayesian, philosophy},
	priority = {2},
	publisher = {University of Minnesota Press},
	series = {Minnesota Studies in the Philosophy of Science},
	title = {Why {Glymour} {IS} a {Bayesian}},
	url = {http://www.cs.orst.edu/~bulatov/papers/glymour\_is\_a\_bayesian.pdf},
	volume = {10},
	year = {1983}
}


@inproceedings{citeulike:120140,
	abstract = {Maximum entropy is presented as a universal method of finding a "best" positive distribution constrained by incomplete data. The generalised entropy Sum (f - m - f log (f/m)) is the only form which selects acceptable distributions f in particular cases. It holds even if f is not normalised, so that maximum entropy applies directly to physical distributions other than probabilities. Furthermore, maximum entropy should also be used to select "best" parameters if the underlying model m has such freedom.},
	author = {Skilling, J.},
	booktitle = {Maximum-Entropy and Bayesian Methods in Science and Engineering},
	citeulike-article-id = {120140},
	editor = {Erickson and Smith},
	keywords = {maxent, philosophy},
	priority = {2},
	publisher = {Kluwer},
	title = {The Axioms of Maximum Entropy},
	url = {http://www.cs.orst.edu/~bulatov/papers/skilling-axioms.pdf},
	year = {1988}
}


@book{citeulike:120124,
	abstract = {{In Algebra of Probable Inference, Richard T. Cox develops and demonstrates that probability theory is the only theory of inductive inference that abides by logical consistency. Cox does so through a functional derivation of probability theory as the unique extension of Boolean Algebra thereby establishing, for the first time, the legitimacy of probability theory as formalized by Laplace in the 18th century. Perhaps the most significant consequence of Cox's work is that probability represents a subjective degree of plausible belief relative to a particular system but is a theory that applies universally and objectively across any system making inferences based on an incomplete state of knowledge. Cox goes well beyond this amazing conceptual advancement, however, and begins to formulate a theory of logical questions through his consideration of systems of assertions---a theory that he more fully developed some years later. Although Cox's contributions to probability are acknowledged and have recently gained worldwide recognition, the significance of his work regarding logical questions is virtually unknown. The contributions of Richard Cox to logic and inductive reasoning may eventually be seen to be the most significant since Aristotle.}},
	author = {Cox, Richard T.},
	citeulike-article-id = {120124},
	howpublished = {Paperback},
	isbn = {080186982X},
	keywords = {bayesian, philosophy},
	month = feb,
	priority = {2},
	publisher = {{Johns Hopkins University Press}},
	title = {Algebra of Probable Inference},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/080186982X},
	year = {2002}
}


@incollection{citeulike:120108,
	abstract = {The aim of confirmation theory is to provide a true account of the principles
that guide scientific argument in so far as that argument is not, and
does not purport to be, of a deductive kind. A confirmation theory should
serve as a critical and explanatory instrument quite as much as do theories
of deductive inference. Any successful confirmation theory should, for
example, reveal the structure and fallacies, if any, in Newton's argument
for universal gravitation, in nineteenth-century arguments for and against
the atomic theory, in Freud's arguments for psychoanalytic generalizations.
Where scientific judgements are widely shared, and sociological
factors cannot explain their ubiquity, and analysis through the lens provided
by confirmation theory reveals no good explicit arguments for the
judgements, confirmation theory ought at least sometimes to suggest some
good arguments that may have been lurking misperceived. Theories of
deductive inference do that much for scientific reasoning in so far as that
reasoning is supposed to be demonstrative. We can apply quantification
theory to assess the validity of scientific arguments, and although we must
almost always treat such arguments as enthymematic, the premisses we
interpolate are not arbitrary; in many cases, as when the same subject-matter
is under discussion, there is a common set of suppressed premisses.
Again, there may be differences about the correct logical form of scientific
claims; differences of this kind result in (or from) different formalizations
for example, of classical mechanics. But such differences often make no
difference for the assessment of validity in actual arguments. Confirmation
theory should do as well in its own domain. If it fails, then it may still be of
interest for many purposes, but not for the purpose of understanding
scientific reasoning.
The aim of confirmation theory ought not to be simply to provide
precise replacements for informal methodological notions, that is, expli-},
	author = {Glymour, Clark},
	booktitle = {Theory and Evidence},
	citeulike-article-id = {120108},
	keywords = {bayesian, philosophy},
	pages = {63--93},
	priority = {2},
	publisher = {University of Chicago Press},
	title = {Why {I} am not a {Bayesian}},
	url = {http://yaroslav.hopto.org/papers/sfx1ba.pdf},
	year = {1981}
}


@techreport{citeulike:119506,
	abstract = {A key task in the elicitation of expert knowledge is to construct a specific
elicited distribution from the finite, and usually small, number of statements
that have been elicited from the expert. These statements typically specify
some quantiles of the distribution, perhaps the mode and sometimes the
mean or other moments. Such statements are not enough to identify the expert's
probability distribution uniquely, and the usual approach is to fit some
member of a convenient parametric family. There are two clear deficiencies
in this solution. First, the expert's beliefs are forced to fit the parametric
family. Second, no account is then taken of the many other possible distributions
that might have fitted the elicited statements equally well. We present
an approach which tackles both of these deficiencies. Our model is nonparametric,
allowing the expert's distribution to take any continuous form. It also
quantifies the uncertainty in the resulting elicited distribution. Formally, the
expert's density function is treated as an unknown function, about which we
make inference. The result is a posterior distribution for the expert's density
function. The posterior mean serves as a `best fit' elicited distribution, while
the variance around this fit expresses the uncertainty in the elicitation. Two
illustrations of our method are given using test examples.
KEY WORDS: Expert elicitation, Gaussian process, non-parametric density
estimation.},
	author = {Oakley, Jeremy and O'Hagan, Anthony},
	citeulike-article-id = {119506},
	comment = {- Defines a prior over densities (Gaussian Process) that an expert may have, uses Bayesian Approach (Posterior Mean) to find best estimate},
	institution = {Department of Probability and Statistics, University of Sheffield},
	keywords = {bayesian, elicitation},
	priority = {2},
	title = {Uncertainty in Prior Elicitations: a Nonparametric Approach},
	url = {http://lib.stat.cmu.edu/DOS/general/first-bayes/pdf/elicitation3.pdf},
	year = {2002}
}


@article{citeulike:117646,
	abstract = {Elicitation of expert opinion is becoming increasingly important in the elicitation of prior distributions. In this paper, the psychology of elicitation and the currently available methods are briefly reviewed, but the primary discussion is on the distinction between `general' elicitation methods for a class of problems and `application-specific' methods which are useful only once. Examples of both types of elicitation are given, along with a discussion about general versus application-specific methods, and predictive versus structural elicitation.},
	author = {Kadane, J. and Wolfson, L. J.},
	citeulike-article-id = {117646},
	issn = {0039-0526},
	journal = {Journal of the Royal Statistical Society: Series D (The Statistician)},
	keywords = {bayesian, elicitation},
	month = mar,
	number = {1},
	pages = {3--19},
	priority = {2},
	title = {Experiences in elicitation},
	url = {http://www.ingentaconnect.com/content/bpl/rssd/1998/00000047/00000001/art00113},
	volume = {47},
	year = {1998}
}


@misc{citeulike:117639,
	abstract = {We provide a polyhedral description of the conditions for the existence of
the maximum likelihood estimate (MLE) for a hierarchical log-linear model. The
MLE exists if and only if the observed margins lie in the relative interior of
the marginal cone. Using this description, we give an algorithm for determining
if the MLE exists. If the tree width is bounded, the algorithm runs in
polynomial time. We also perform a computational study of the case of three
random variables under the no three-factor effect model.},
	archiveprefix = {arXiv},
	author = {Eriksson, Nicholas and Fienberg, Stephen E. and Rinaldo, Alessandro and Sullivant, Seth},
	citeulike-article-id = {117639},
	comment = {- MLE doesn't exist when the supremum of likelihood over the domain is greater than any point in the domain.
- Only happens for open parameter sets.
- Contingency tables with 0's cause non-existing MLE's (because lambda goes to infinity)
- Do MLE's always exist if we include -infinity's in our natural parameter space?},
	eprint = {math.CO/0405044},
	keywords = {statistics},
	month = may,
	priority = {2},
	title = {Polyhedral conditions for the nonexistence of the {MLE} for hierarchical log-linear models},
	url = {http://arxiv.org/abs/math.CO/0405044},
	year = {2004}
}


@incollection{citeulike:116263,
	abstract = {1 Introduction 2
2 Glivenko-Cantelli Classes 5
2.1 The classical approach 5
2.1.1 The symmetrization procedure 7
2.1.2 Covering numbers and complexity estimates 9
2.2 Combinatorial parameters and covering numbers 12
2.2.1 Uniform entropy and the VC dimension 13
2.2.2 Generalized combinatorial parameters 16
2.3 Talagrand's inequality 18
2.4 Random averages, combinatorial parameters and covering numbers 21
2.4.1 Structural results 21
2.4.2 Example: Kernel Classes 23
2.4.3 Entropy and averages 25
3 Learning sample complexity 29
3.1 Localized random averages 32
3.1.1 Localized averages of kernel classes 32
3.1.2 Using the Entropy 33
3.2 The iterative scheme 37
A Concentration of measure and Rademacher averages 38},
	author = {Mendelson, Shahar},
	booktitle = {Advanced Lectures on Machine Learning},
	citeulike-article-id = {116263},
	comment = {Canberra summer school lecture notes},
	editor = {Mendelson, Shahar and Smola, Alexander J.},
	isbn = {3540005293},
	keywords = {regularization, statistical-learning-theory},
	pages = {1--40},
	priority = {2},
	publisher = {Springer-Verlag New York, Inc.},
	series = {Lecture Notes in Computer Science},
	title = {A few notes on statistical learning theory},
	url = {http://portal.acm.org/citation.cfm?id=863716},
	volume = {2600},
	year = {2003}
}


@book{citeulike:117058,
	author = {Rosenkrantz, Roger D.},
	citeulike-article-id = {117058},
	howpublished = {{Unknown Binding}},
	isbn = {9027708177},
	keywords = {bayesian, book, philosophy},
	month = sep,
	priority = {2},
	publisher = {{D. Reidel Pub. Co}},
	series = {Synthese Library},
	title = {Inference, method, and decision: Towards a {Bayesian} philosophy of science},
	url = {http://www.amazon.fr/exec/obidos/redirect?tag=citeulike06-21\&path=ASIN/9027708177},
	volume = {115},
	year = {1977}
}


@article{citeulike:117010,
	abstract = {We study the geometry of the parameter space for Bayesian directed graphical models with hidden variables that have a tree structure and where all the nodes are binary. We show that the conditional independence statements implicit in such models can be expressed in terms of polynomial relationships among the central moments. This algebraic structure will enable us to identify the inequality constraints on the space of the manifest variables that are induced by the conditional independence assumptions as well as determine the degree of unidentifiability of the parameters associated with the hidden variables. By understanding the geometry of the sample space under this class of models we shall propose and discuss simple diagnostic methods.},
	author = {Settimi and Smith},
	citeulike-article-id = {117010},
	journal = {Annals of Statistics},
	keywords = {bayesnet, geometry},
	number = {4},
	priority = {2},
	title = {Geometry, moments and conditional independence trees with hidden variables},
	url = {http://projecteuclid.org/Dienst/UI/1.0/Summarize/euclid.aos/1015956712},
	volume = {28},
	year = {2000}
}


@article{citeulike:116762,
	abstract = {First, a snapshot is provided of the current state of Bayesian statistics. Included are entry points to the Bayesian literatures in various disciplines and various areas of statistics. Next, the status of the various approaches to Bayesian analysis are discussed; these approaches are termed the objective, subjective, robust, frequentist-Bayes, and quasi-Bayes approaches. Speculations about the future are sprinkled throughout this latter material. Finally, comments about computation and existing and future software are given.},
	author = {Berger},
	citeulike-article-id = {116762},
	comment = {Compares subjective, objective, robust and quasi-Bayesian approaches. Predicts what the future is going to be (mix of Bayesianism and frequentism)},
	journal = {Journal of the American Statistical Association},
	keywords = {bayesian, philosophy},
	priority = {0},
	title = {Bayesian analysis: a look at today and thoughts of tomorrow},
	url = {http://scholar.google.com/url?sa=U\&q=http://www.isds.duke.edu/~berger/papers/99-30.ps},
	year = {2000}
}


@article{Efron:1986:WIE,
	author = {Efron, B.},
	citeulike-article-id = {116410},
	comment = {Efron's attempt to answer why not everyone is a Bayesian, as of 1985. Main reasons

- non-Bayesian approaches easier to use
- more computationally feasible
- don't have to rely on controversial subjective priors},
	journal = {The American Statistician},
	keywords = {bayesian, philosophy},
	number = {1},
	pages = {1--11},
	priority = {0},
	title = {Why isn't everyone a {Bayesian}? With discussion and a reply by the author},
	url = {http://web.engr.oregonstate.edu/~bulatov/papers/sfx19ad.pdf},
	volume = {40},
	year = {1986}
}


@article{citeulike:116409,
	abstract = {Graphical Markov models use undirected graphs (UDGs), acyclic directed graphs (ADGs), or (mixed) chain graphs to represent possible dependencies among random variables in a multivariate distribution. Whereas a UDG is uniquely determined by its associated Markov model, this is not true for ADGs or for general chain graphs (which include both UDGs and ADGs as special cases). This paper addresses three questions regarding the equivalence of graphical Markov models: when is a given chain graph Markov equivalent (1) to some UDG? (2) to some (at least one) ADG? (3) to some decomposable UDG? The answers are obtained by means of an extension of Frydenberg's (1990) elegant graph-theoretic characterization of the Markov equivalence of chain graphs.},
	author = {Andersson, S. A. and Madigan, D. and Perlman, M. D.},
	citeulike-article-id = {116409},
	comment = {Gives a criterion for a chain graph to be equivalent to some undirected graph, or to some acyclic digraph},
	issn = {0303-6898},
	journal = {Scandinavian Journal of Statistics},
	keywords = {models},
	month = mar,
	number = {1},
	pages = {81--102},
	priority = {2},
	title = {On the {Markov} Equivalence of Chain Graphs, Undirected Graphs, and Acyclic Digraphs},
	url = {http://www.ingentaconnect.com/content/bpl/sjos/1997/00000024/00000001/art00050},
	volume = {24},
	year = {1997}
}


@article{citeulike:115169,
	abstract = {This paper reviews the Bayesian approach to model selection and model averaging. In this review, I emphasize objective Bayesian methods based on noninformative priors. I will also discuss implementation details, approximations, and relationships to other methods. Copyright 2000 Academic Press.},
	address = {Carnegie Mellon University},
	author = {Wasserman, L.},
	citeulike-article-id = {115169},
	comment = {- objective Bayesian proponent
- Explains BIC as approximate log Bayes Factor.
- Explanations of AIC, Jeffrey's prior},
	doi = {10.1006/jmps.1999.1278},
	issn = {0022-2496},
	journal = {Journal of Mathematical Psychology},
	keywords = {bayesian, model-selection},
	month = mar,
	number = {1},
	pages = {92--107},
	priority = {3},
	title = {Bayesian Model Selection and Model Averaging},
	url = {http://dx.doi.org/10.1006/jmps.1999.1278},
	volume = {44},
	year = {2000}
}


@inproceedings{citeulike:115122,
	abstract = {We present a diagrammatic formalism and practical methods for introducing
additional independence assumptions into parameter estimation,
enabling efficient training of undirected graphical models in
locally-normalized pieces. On two real-world data sets we demonstrate
our locally-trained linear-chain CRFs outperforming traditional CRFs---
training in less than one-fifth the time, and providing a statistically-significant
gain in accuracy.},
	author = {McCallum, Andrew and Sutton, Charles},
	booktitle = {{NIPS} 2004 workshop on structured learning},
	citeulike-article-id = {115122},
	keywords = {crf, graphical, mlrg},
	priority = {2},
	title = {Piecewise Training with Parameter Independence Diagrams: Comparing Globally- and Locally-trained Linear-chain {CRFs}},
	url = {http://www.cs.umass.edu/~mccallum/papers/lcrf-nips2004.pdf},
	year = {2004}
}


@book{citeulike:115106,
	abstract = {{The aim of this book is to discuss the fundamental ideas which lie behind the statistical theory of learning and generalization. It considers learning as a general problem of function estimation based on empirical data. Omitting proofs and technical details, the author concentrates on discussing the main results of learning theory and their connections to fundamental problems in statistics. These include: * the setting of learning problems based on the model of minimizing the risk functional from empirical data * a comprehensive analysis of the empirical risk minimization principle including necessary and sufficient conditions for its consistency * non-asymptotic bounds for the risk achieved using the empirical risk minimization principle * principles for controlling the generalization ability of learning machines using small sample sizes based on these bounds * the Support Vector methods that control the generalization ability when estimating function using small sample size. The second edition of the book contains three new chapters devoted to further development of the learning theory and SVM techniques. These include: * the theory of direct method of learning based on solving multidimensional integral equations for density, conditional probability, and conditional density estimation * a new inductive principle of learning. Written in a readable and concise style, the book is intended for statisticians, mathematicians, physicists, and computer scientists. Vladimir N. Vapnik is Technology Leader AT\&T Labs-Research and Professor of London University. He is one of the founders of statistical learning theory, and the author of seven books published in English, Russian, German, and Chinese.}},
	author = {Vapnik, Vladimir N.},
	citeulike-article-id = {115106},
	edition = {Second},
	howpublished = {Hardcover},
	isbn = {0387987800},
	keywords = {book, statistical-learning-theory, svm},
	month = nov,
	priority = {2},
	publisher = {Springer},
	series = {Information Science and Statistics},
	title = {The Nature of Statistical Learning Theory},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0387987800},
	year = {1999}
}


@book{citeulike:115039,
	abstract = {{Uncertainty is a fundamental and unavoidable feature of daily life; in order to deal with uncertainty intelligently, we need to be able to represent it and reason about it. In this book, Joseph Halpern examines formal ways of representing uncertainty and considers various logics for reasoning about it. While the ideas presented are formalized in terms of definitions and theorems, the emphasis is on the philosophy of representing and reasoning about uncertainty; the material is accessible and relevant to researchers and students in many fields, including computer science, artificial intelligence, economics (particularly game theory), mathematics, philosophy, and statistics. Halpern begins by surveying possible formal systems for representing uncertainty, including probability measures, possibility measures, and plausibility measures. He considers the updating of beliefs based on changing information and the relation to Bayes' theorem; this leads to a discussion of qualitative, quantitative, and plausibilistic Bayesian networks. He considers not only the uncertainty of a single agent but also uncertainty in a multi-agent framework. Halpern then considers the formal logical systems for reasoning about uncertainty. He discusses knowledge and belief; default reasoning and the semantics of default; reasoning about counterfactuals, and combining probability and counterfactuals; belief revision; first-order modal logic; and statistics and beliefs. He includes a series of exercises at the end of each chapter.}},
	author = {Halpern, Joseph Y.},
	citeulike-article-id = {115039},
	howpublished = {Hardcover},
	isbn = {0262083205},
	keywords = {book, ki, philosophy},
	month = oct,
	priority = {2},
	publisher = {{The MIT Press}},
	title = {Reasoning about Uncertainty},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0262083205},
	year = {2003}
}


@book{citeulike:114719,
	abstract = {{This is the first comprehensive introduction to Support Vector Machines (SVMs), a new generation learning system based on recent advances in statistical learning theory. Students will find the book both stimulating and accessible, while practitioners will be guided smoothly through the material required for a good grasp of the theory and its applications. The concepts are introduced gradually in accessible and self-contained stages, while the presentation is rigorous and thorough. Pointers to relevant literature and web sites containing software make it an ideal starting point for further study.}},
	author = {Cristianini, Nello and Shawe-Taylor, John},
	citeulike-article-id = {114719},
	howpublished = {Hardcover},
	isbn = {0521780195},
	keywords = {book, regularization, statistical-learning-theory, svm},
	month = mar,
	priority = {2},
	publisher = {{Cambridge University Press}},
	title = {An Introduction to Support Vector Machines and Other Kernel-based Learning Methods},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0521780195},
	year = {2000}
}


@book{citeulike:114715,
	abstract = {{This book is unique in that it covers the philosophy of model-based data analysis and an omnibus strategy for the analysis of empirical data. The book introduces information theoretic approaches and focuses critical attention on a priori modeling and the selection of a good approximating model that best represents the inference supported by the data. Kullback-Leibler information represents a fundamental quantity in science and is Hirotugu Akaike's basis for model selection. The maximized log-likelihood function can be bias-corrected to provide an estimate of expected, relative Kullback-Leibler information. This leads to Akaike's Information Criterion (AIC) and various extensions and these are relatively simple and easy to use in practice, but little taught in statistics classes and far less understood in the applied sciences than should be the case. The information theoretic approaches provide a unified and rigorous theory, an extension of likelihood theory, an important application of information theory, and are objective and practical to employ across a very wide class of empirical problems. Parameter estimation has long been viewed as an optimization problem (e.g., maximize the log-likelihood or minimize the residual sum of squared deviations) and under the information theoretic paradigm, data-based model selection is also an optimization problem. This brings model selection and parameter estimation under a common framework - optimization. The value of AIC is computed for each a priori model to be considered and the model with the minimum AIC is used for statistical inference. However, the paradigm described in this book goes beyond merely the computation and interpretation of AIC to select a parsimonious model for inference from empirical data; it refocuses increased attention on a variety of considerations and modeling prior to the actual analysis of data.}},
	author = {Burnham, Kenneth P. and Anderson, David R.},
	citeulike-article-id = {114715},
	howpublished = {Hardcover},
	isbn = {0387985042},
	keywords = {book, model-selection},
	month = nov,
	priority = {2},
	publisher = {{Springer-Verlag Telos}},
	title = {Model Selection and Inference: A Practical Information-Theoretic Approach},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0387985042},
	year = {1998}
}


@inproceedings{citeulike:114712,
	abstract = {We show that some graphical models with no hidden variables including Bayesian networks with several families of local distributions are Curved Exponential Families (CEFs). We also show that Bayesian networks with hidden variables, and several other types of graphical models including non-chordal undirected graphical models are Stratified Exponential Families (SEFs). In addition, we illustrate how one can automatically generate independence and non-independence constraints on the distributions over the observable variables implied by a Bayesian network with hidden variables. The relevance of these results for Bayesian model selection is examined.},
	author = {Geiger, Dan and Meek, Christopher},
	booktitle = {Proceedings of the Fourteenth Conference on Uncertainty in Artificial Intelligence ({UAI} 1998)},
	citeulike-article-id = {114712},
	keywords = {graphical, information-geometry, models},
	priority = {2},
	title = {Graphical Models and Exponential Families},
	url = {http://research.microsoft.com/research/pubs/view.aspx?tr\_id=123},
	year = {1998}
}


@article{citeulike:114710,
	abstract = {This paper deals with the existence of maximum likelihood estimators for multi-dimensional exponential families, including curved exponential families. It first gives an algorithm for determining the MLE from the data. Then it establishes that when the parameter set is either open or relatively closed in the natural parameter set, the MLE of the parameter exists in the sense of Hoffmann-Jorgensen.},
	author = {Miao, W. and Hahn, M.},
	citeulike-article-id = {114710},
	doi = {10.1111/1467-9469.00070},
	issn = {0303-6898},
	journal = {Scandinavian Journal of Statistics},
	keywords = {estimation, exponential-families},
	month = sep,
	number = {3},
	pages = {371--386},
	priority = {2},
	title = {Existence of Maximum Likelihood Estimates for Multi-dimensional Exponential Families},
	url = {http://dx.doi.org/10.1111/1467-9469.00070},
	volume = {24},
	year = {1997}
}


@article{citeulike:90021,
	abstract = {Traditional analyses of the curve fitting problem maintain that the data do not indicate what form the fitted curve should take. Rather, this issue is said to be settled by prior probabilities, by simplicity, or by a background theory. In this paper, we describe a result due to Akaike [1973], which shows how the data can underwrite an inference concerning the curve's form based on an estimate of how predictively accurate it will be. We argue that this approach throws light on the theoretical virtues of parsimoniousness, unification, and non ad hocness, on the dispute about Bayesianism, and on empiricism and scientific realism.},
	author = {Forster, Malcolm and Sober, Elliott},
	citeulike-article-id = {90021},
	journal = {The British Journal for the Philosophy of Science},
	keywords = {bayesian, model-selection, philosophy, regularization},
	month = mar,
	number = {1},
	pages = {1--35},
	priority = {2},
	title = {How to Tell When Simpler, More Unified, or Less Ad Hoc Theories Will Provide More Accurate Predictions},
	url = {http://links.jstor.org/sici?sici=0007-0882\%28199403\%2945\%3A1\%3C1\%3AHTTWSM\%3E2.0.CO\%3B2-I},
	volume = {45},
	year = {1994}
}


@inproceedings{citeulike:114629,
	abstract = {Bayesian network models are widely used for discriminative
prediction tasks such as classification.
Usually their parameters are determined using `unsupervised'
methods such as maximization of the
joint likelihood. The reason is often that it is unclear
how to find the parameters maximizing the
conditional (supervised) likelihood. We show how
the discriminative learning problem can be solved
efficiently for a large class of Bayesian network
models, including the Naive Bayes (NB) and tree-augmented
Naive Bayes (TAN) models. We do this
by showing that under a certain general condition
on the network structure, the discriminative learning
problem is exactly equivalent to logistic regression
with unconstrained convex parameter spaces.
Hitherto this was known only for Naive Bayes models.
Since logistic regression models have a concave
log-likelihood surface, the global maximum
can be easily found by local optimization methods.},
	author = {Wettig, Hannes and Gr{\"u}nwald, Peter and Roos, Teemu and Myllym{\"a}ki, Petri and Tirri, Henry},
	booktitle = {Proceedings of the Eighteenth International Joint Conference on Artificial Intelligence ({IJCAI} 2003)},
	citeulike-article-id = {114629},
	comment = {Explicit formula for converting between BN and Logistic Regression parameters},
	keywords = {bayesnet, generative-discriminative},
	pages = {491--496},
	priority = {2},
	title = {When Discriminative Learning of {Bayesian} Network Parameters Is Easy},
	url = {http://cosco.hiit.fi/Articles/ijcai03.pdf},
	year = {2003}
}


@inproceedings{citeulike:114627,
	author = {Grossman, Daniel and Domingos, Pedro},
	booktitle = {ICML '04: Twenty-first international conference on Machine learning},
	citeulike-article-id = {114627},
	doi = {10.1145/1015330.1015339},
	isbn = {1581138285},
	keywords = {bayesnet, generative-discriminative},
	priority = {2},
	publisher = {ACM Press},
	title = {Learning {Bayesian} network classifiers by maximizing conditional likelihood},
	url = {http://portal.acm.org/citation.cfm?id=1015339},
	year = {2004}
}


@inproceedings{citeulike:114614,
	author               = {Greiner, Russell and Zhou, Wei},
	title                = {Structural extension to logistic regression: discriminative parameter learning of belief net classifiers},
	booktitle            = {Eighteenth national conference on Artificial intelligence},
	publisher            = {American Association for Artificial Intelligence},
	year                 = {2002},
	pages                = {167--173},
	isbn                 = {0262511290},
	url                  = {http://portal.acm.org/citation.cfm?id=777121},
	keywords             = {bayesnet, generative-discriminative, naivebayes},
	priority             = {2},
	citeulike-article-id = {114614}
}


@inproceedings{citeulike:110110,
	abstract = {We define the relevant information in a signal $x \in X$ as being the information that this signal provides about another signal $y \in Y$. Examples include the information that face images provide about the names of the people portrayed, or the information that speech sounds provide about the words spoken. Understanding the signal x requires more than just predicting y, it also requires specifying which features of X play a role in the prediction. We formalize the problem as that of finding a short...},
	author = {Tishby, Naftali and Pereira, Fernando C. and Bialek, William},
	booktitle = {Proceedings of the 37th Annual Allerton Conference on Communication, Control and Computing},
	citeulike-article-id = {110110},
	comment = {Applying rate distortion theory to optimally quantize the input space, where labelled data is used to define the distortion measure (ie, low distortion if can predict Y with remaining information)},
	keywords = {information-theory, machine-learning},
	pages = {368--377},
	priority = {0},
	title = {The information bottleneck method},
	url = {http://citeseer.ist.psu.edu/tishby99information.html},
	year = {1999}
}


@misc{citeulike:113787,
	abstract = {Imagine being shown $N$ samples of random variables drawn independently from
the same distribution. What can you say about the distribution? In general, of
course, the answer is nothing, unless we have some prior notions about what to
expect. From a Bayesian point of view we need an {\it a priori} distribution on
the space of possible probability distributions, which defines a scalar field
theory. In one dimension, free field theory with a constraint provides a
tractable formulation of the problem, and we also discuss generalizations to
higher dimensions.},
	archiveprefix = {arXiv},
	author = {Bialek, William and Callan, Curtis G. and Strong, S. P.},
	citeulike-article-id = {113787},
	comment = {Justifying estimation of a continuous probability density function from several datapoints in a non-parametric setting},
	eprint = {cond-mat/9607180},
	keywords = {nonparametric, physics},
	priority = {2},
	title = {Field Theories for Learning Probability Distributions},
	url = {http://arxiv.org/abs/cond-mat/9607180},
	year = {1996}
}


@misc{citeulike:113786,
	abstract = {We try to establish a unified information theoretic approach to learning and
to explore some of its applications. First, we define {\em predictive
information} as the mutual information between the past and the future of a
time series, discuss its behavior as a function of the length of the series,
and explain how other quantities of interest studied previously in learning
theory - as well as in dynamical systems and statistical mechanics - emerge
from this universally definable concept. We then prove that predictive
information provides the {\em unique measure for the complexity} of dynamics
underlying the time series and show that there are classes of models
characterized by {\em power-law growth of the predictive information} that are
qualitatively more complex than any of the systems that have been investigated
before. Further, we investigate numerically the learning of a nonparametric
probability density, which is an example of a problem with power-law
complexity, and show that the proper Bayesian formulation of this problem
provides for the `Occam' factors that punish overly complex models and thus
allow one {\em to learn not only a solution within a specific model class, but
also the class itself} using the data only and with very few a priori
assumptions. We study a possible {\em information theoretic method} that
regularizes the learning of an undersampled discrete variable, and show that
learning in such a setup goes through stages of very different complexities.
Finally, we discuss how all of these ideas may be useful in various problems in
physics, statistics, and, most importantly, biology.},
	archiveprefix = {arXiv},
	author = {Nemenman, Ilya},
	citeulike-article-id = {113786},
	eprint = {physics/0009032},
	keywords = {information-geometry, information-theory, physics},
	month = sep,
	priority = {2},
	title = {Information theory and learning: a physical approach},
	url = {http://arxiv.org/abs/physics/0009032},
	year = {2000}
}


@article{citeulike:113777,
	abstract = {We consider the problem of PAC learning probabilistic networks in the case where the structure of the
net is specified beforehand. We allow the conditional probabilities to be represented in any manner (as tables or
specialized functions) and obtain sample complexity bounds for learning nets with and without hidden nodes.},
	author = {Dasgupta, Sanjoy},
	citeulike-article-id = {113777},
	comment = {This paper studies the number of examples required to learn Bayesian
networks when we know the network structure. The paper considers learning
both networks where all variables are observable and networks where some
variables are hidden.},
	issn = {0885-6125},
	journal = {Machine Learning},
	keywords = {bayesnet, mlrg, pac, statistical-learning-theory},
	number = {2-3},
	pages = {165--180},
	priority = {0},
	publisher = {Kluwer Academic Publishers},
	title = {The Sample Complexity of Learning Fixed-Structure {Bayesian} Networks},
	url = {http://portal.acm.org/citation.cfm?id=274162},
	volume = {29},
	year = {1997}
}


@techreport{citeulike:113400,
	author = {Richards, John A.},
	citeulike-article-id = {113400},
	comment = {Shrinks towards specified point. Uniformly lower squared error for gaussians in 3 or more dimensions. Authors invented shrinkage.},
	institution = {Massachusetts Institute of Technology},
	keywords = {estimation},
	month = nov,
	priority = {2},
	title = {An Introduction to {James-Stein} Estimation},
	type = {{EECS} Area Exam Report},
	url = {http://ssg.mit.edu/group/alumni/johnrich/index-details.html},
	year = {1999}
}


@inproceedings{citeulike:112859,
	abstract = {We consider the problem of estimating an unknown probability distribution
from samples using the principle of maximum entropy (maxent). To
alleviate overfitting with a very large number of features, we propose applying
the maxent principle with relaxed constraints on the expectations of the features.
By convex duality, this turns out to be equivalent to finding the Gibbs distribution
minimizing a regularized version of the empirical log loss. We prove nonasymptotic
bounds showing that, with respect to the true underlying distribution,
this relaxed version of maxent produces density estimates that are almost
as good as the best possible. These bounds are in terms of the deviation of the
feature empirical averages relative to their true expectations, a number that can
be bounded using standard uniform-convergence techniques. In particular, this
leads to bounds that drop quickly with the number of samples, and that depend
very moderately on the number or complexity of the features. We also derive and
prove convergence for both sequential-update and parallel-update algorithms. Finally,
we briefly describe experiments on data relevant to the modeling of species
geographical distributions.},
	author = {Dud{\'i}k, Miroslav and Phillips, Steven J. and Schapire, Robert E.},
	booktitle = {Proceedings of the 17th Annual Conference on Learning Theory ({COLT} 2004)},
	citeulike-article-id = {112859},
	keywords = {maxent, mlrg},
	priority = {2},
	title = {Performance Guarantees for Regularized Maximum Entropy Density Estimation},
	url = {http://www.cs.princeton.edu/~mdudik/colt\_2004.pdf},
	year = {2004}
}


@inproceedings{citeulike:112854,
	author = {Phillips, Steven J. and Dud{\'i}k, Miroslav and Schapire, Robert E.},
	booktitle = {ICML '04: Twenty-first international conference on Machine learning},
	citeulike-article-id = {112854},
	comment = {- Estimating bird density
- Does MaxEnt with range moment constraints instead of equality moment constraints.
- Scales each range by expected feature variance
- Uses sequential update for training
- Visualizes contribution of each feature to density},
	doi = {10.1145/1015330.1015412},
	isbn = {1581138285},
	keywords = {maxent, mlrg, regularization},
	priority = {0},
	publisher = {ACM Press},
	title = {A maximum entropy approach to species distribution modeling},
	url = {http://portal.acm.org/citation.cfm?id=1015330.1015412},
	year = {2004}
}


@article{citeulike:111909,
	author = {Bousquet, Olivier and Elisseeff, Andr{\'e}},
	citeulike-article-id = {111909},
	issn = {1533-7928},
	journal = {Journal of Machine Learning Research},
	keywords = {regularization},
	pages = {499--526},
	priority = {2},
	publisher = {MIT Press},
	title = {Stability and generalization},
	url = {http://portal.acm.org/citation.cfm?id=944801},
	volume = {2},
	year = {2002}
}


@misc{citeulike:111901,
	author = {Caves, Carlton M.},
	citeulike-article-id = {111901},
	keywords = {bayesian, philosophy},
	priority = {2},
	title = {Resource Material for Promoting the {Bayesian} View of Everything},
	url = {http://info.phys.unm.edu/~caves/thoughts2.2.pdf}
}


@article{citeulike:106713,
	abstract = {Fisher's contributions to statistics are surveyed. His background, skills, temperament, and style of thought and writing are sketched. His mathematical and methodological contributions are outlined. More attention is given to the technical concepts he introduced or emphasized, such as consistency, sufficiency, efficiency, information, and maximum likelihood. Still more attention is given to his conception and concepts of probability and inference, including likelihood, the fiducial argument, and hypothesis testing. Fisher is at once very near to and very far from modern statistical thought generally.},
	author = {Savage, Leonard J.},
	citeulike-article-id = {106713},
	journal = {The Annals of Statistics},
	keywords = {statistics},
	month = may,
	number = {3},
	pages = {441--500},
	priority = {2},
	title = {On Rereading {R. A. Fisher}},
	url = {http://links.jstor.org/sici?sici=0090-5364\%28197605\%294\%3A3\%3C441\%3AORRAF\%3E2.0.CO\%3B2-1},
	volume = {4},
	year = {1976}
}


@article{citeulike:106716,
	abstract = {A number of distinct roles are identified for probability models used in the analysis of data. Examples are outlined. Some general issues arising in the formulation of such models are discussed.},
	author = {Cox, D. R.},
	citeulike-article-id = {106716},
	journal = {Statistical Science},
	keywords = {philosophy, statistics},
	month = may,
	number = {2},
	pages = {169--174},
	priority = {2},
	title = {Role of Models in Statistical Analysis},
	url = {http://links.jstor.org/sici?sici=0883-4237\%28199005\%295\%3A2\%3C169\%3AROMISA\%3E2.0.CO\%3B2-K},
	volume = {5},
	year = {1990}
}


@article{citeulike:108434,
	abstract = {After some brief historical comments on statistical aspects of causality two current views are outlined and their limitations sketched. One definition is that causality is a statistical association that cannot be explained away by confounding variables and the other is based on a link with notions in the design of experiments. The importance of underlying processes or mechanisms is stressed. Implications for empirical statistical analysis are discussed.},
	author = {Cox, D. R.},
	citeulike-article-id = {108434},
	issn = {0964-1998},
	journal = {Journal of the Royal Statistical Society. Series A (Statistics in Society)},
	keywords = {philosophy, statistics},
	number = {2},
	pages = {291--301},
	priority = {2},
	title = {Causality: Some Statistical Aspects},
	url = {http://links.jstor.org/sici?sici=0964-1998\%281992\%29155\%3A2\%3C291\%3ACSSA\%3E2.0.CO\%3B2-O},
	volume = {155},
	year = {1992}
}


@article{citeulike:108891,
	abstract = {The case of a singular Fisher information matrix (FIM) represents a significant complication for the theory of the Cramer-Rao lower bound (CRB) that is usually handled by resorting to the pseudoinverse of the Fisher matrix. We take a different approach in which the CRB is derived as the solution to an unconstrained quadratic maximization problem, which enables us to handle the singular case in a simple yet rigorous manner. When the Fisher matrix is singular, except under unusual circumstances, any estimator having the specified bias derivatives that figure in the CRB must have infinite variance.},
	author = {Stoica, Petre and Marzetta, Thomas L.},
	citeulike-article-id = {108891},
	comment = {-Simplified derivation of Rao-Cramer bound.
-Conditions on when singularity of FIM implies infinite variance.
-Shows that Penrose-Moore pseudo-inverse gives an overly optimistic (loose) bound on variance for typical cases.},
	journal = {IEEE Transactions on Signal Processing},
	keywords = {estimation, fisher-information},
	number = {1},
	pages = {87--90},
	priority = {0},
	title = {Parameter estimation problems with singular information matrices},
	url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=890346},
	volume = {49},
	year = {2001}
}


@article{citeulike:108704,
	author = {Saul, Lawrence K. and Roweis, Sam T.},
	citeulike-article-id = {108704},
	comment = {Larger version of Science article, implementation details, interesting linear algebra},
	issn = {1533-7928},
	journal = {Journal of Machine Learning Research},
	keywords = {mlrg, unsupervised},
	pages = {119--155},
	priority = {2},
	publisher = {MIT Press},
	title = {Think globally, fit locally: unsupervised learning of low dimensional manifolds},
	url = {http://portal.acm.org/citation.cfm?id=945372},
	volume = {4},
	year = {2003}
}


@article{citeulike:108703,
	author = {Roweis, Sam T. and Saul, Lawrence K.},
	citeulike-article-id = {108703},
	comment = {Represents each point as a convex combination of its neighbours, then reconstructs points in a lower dimensional space so that this representation is preserved. Surprisingly the second part is reduced to a linear problem and solved using matrix operations.},
	doi = {10.1126/science.290.5500.2323},
	journal = {Science},
	keywords = {mlrg, unsupervised},
	month = dec,
	number = {5500},
	pages = {2323--2326},
	priority = {0},
	title = {Nonlinear Dimensionality Reduction by Locally Linear Embedding},
	url = {http://dx.doi.org/10.1126/science.290.5500.2323},
	volume = {290},
	year = {2000}
}


@article{citeulike:108686,
	abstract = {There are difficulties with probability as a representation of uncertainty. However, we argue that there is an important distinction between principle and practice. In principle, probability is uniquely appropriate for the representation and quantification of all forms of uncertainty; it is in this sense that we claim that 'probability is perfect'. In practice, people find it difficult to express their knowledge and beliefs in probabilistic form, so that elicitation of probability distributions is a far from perfect process. We therefore argue that there is no need for alternative theories, but that any practical elicitation of expert knowledge must fully acknowledge imprecision in the resulting distribution. We outline a recently developed Bayesian technique that allows the imprecision in elicitation to be formulated explicitly, and apply it to some of the challenge problems.},
	author = {O'Hagan, Anthony and Oakley, Jeremy E.},
	citeulike-article-id = {108686},
	doi = {10.1016/j.ress.2004.03.014},
	journal = {Reliability Engineering \& System Safety},
	keywords = {elicitation, ki},
	number = {1-3},
	pages = {239--248},
	priority = {2},
	title = {Probability is perfect, but we can't elicit it perfectly},
	url = {http://dx.doi.org/10.1016/j.ress.2004.03.014},
	volume = {85},
	year = {2004}
}


@book{citeulike:106699,
	abstract = {{A comprehensive look at learning and generalization theory. The statistical theory of learning and generalization concerns the problem of choosing desired functions on the basis of empirical data. Highly applicable to a variety of computer science and robotics fields, this book offers lucid coverage of the theory as a whole. Presenting a method for determining the necessary and sufficient conditions for consistency of learning process, the author covers function estimates from small data pools, applying these estimations to real-life problems, and much more.}},
	author = {Vapnik, Vladimir N.},
	citeulike-article-id = {106699},
	howpublished = {Hardcover},
	isbn = {0471030031},
	keywords = {book, statistical-learning-theory, svm},
	month = sep,
	priority = {2},
	publisher = {Wiley-Interscience},
	title = {Statistical Learning Theory},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0471030031},
	year = {1998}
}


@techreport{citeulike:106398,
	abstract = {We formulate necessary and sufficient conditions for an arbitrary discrete probability distribution to factor according to an undirected graphical model, or a log-linear model, or other more general exponential models. This characterization generalizes the well-known Hammersley-Clifford Theorem. We show that for decomposable graphical models these conditions are equivalent to a set of statistical independence facts as in the Hammersley-Clifford Theorem but that for non-decomposable graphical models they are not. We also show that non-decomposable models can have non-rational maximum likelihood estimates. Finally, using these results, we provide a characterization of decomposable graphical models.},
	author = {Geiger, Dan and Meek, Christopher and Sturmfels, Bernd},
	citeulike-article-id = {106398},
	comment = {(private-note)Defines exponential, loglinear, graphical models
---=note-separator=---
- Hammersley-Clifford related independence equations to the form of positive distributions satisfying those equations (factored into clique potentials).
- Distribution factors according to A if it's in the image of phi\_A
- Factorization theorem -- P factors according to A iff the support of P is nice and all polynomials in an ideal basis of the toric ideal I\_A vanish at P
- In addition to independence equations, "cross-product ratios" enter for non-decomposable models:
- ie, simplest non-decomposable model (4 loop) has following condition (in addition to independence equations and other cpr constraints):
p0100 p0111 p1001 p1010 = p0101 p0110 p1000 p1011},
	institution = {Microsoft Research},
	keywords = {algebra, exponential-families, geometry, graphical, loglinear},
	month = feb,
	number = {MSR-TR-2002-47},
	priority = {0},
	title = {On the Toric Algebra of Graphical Models},
	url = {http://research.microsoft.com/research/pubs/view.aspx?tr\_id=560},
	year = {2002}
}


@article{citeulike:106385,
	abstract = {A class of log-linear models, referred to as labelled graphical models (LGMs), is introduced for multinomial distributions. These models generalize graphical models (GMs) by employing partial conditional independence restrictions which are valid only in subsets of an outcome space. Theoretical results concerning model identifiability, decomposability and estimation are derived. A decision theoretical framework and a search algorithm for the identification of plausible models are described. Real data sets are used to illustrate that LGMs may provide a simpler interpretation of a dependence structure than GMs.},
	author = {Corander, Jukka  },
	citeulike-article-id = {106385},
	doi = {10.1111/1467-9469.00344},
	journal = {Scandinavian Journal of Statistics},
	keywords = {bayesnet, graphical, maxent},
	priority = {2},
	title = {Labelled Graphical Models},
	url = {http://www.blackwell-synergy.com/links/doi/10.1111/1467-9469.00344},
	year = {2003}
}


@inproceedings{citeulike:106358,
	abstract = {Statistical models of word-sense disambiguation are often based on a small number of contextual features or on a model that is assumed to characterize the interactions among a set of features. Model selection is presented as an alternative to these approaches, where a sequential search of possible models is conducted in order to find the model that best characterizes the interactions among features. This paper expands existing model selection methodology and presents the first comparative study of model selection search strategies and evaluation criteria when applied to the problem of building probabilistic classifiers for word-sense disambiguation.},
	author = {Pedersen and Bruce},
	booktitle = {14th National Conference on Artificial Intelligence},
	citeulike-article-id = {106358},
	keywords = {maxent},
	priority = {2},
	title = {A New Supervised Learning Algorithm for Word Sense Disambiguation},
	url = {http://acl.ldc.upenn.edu/A/A97/A97-1056.pdf},
	year = {1997}
}


@article{citeulike:106350,
	author               = {Mikheev, Andrei},
	title                = {Feature Lattices and Maximum Entropy Models},
	journal              = {Journal for Natural Language Engineering},
	year                 = 1998,
	url                  = {http://www.ltg.ed.ac.uk/~mikheev/papers.html\#kluwer98},
	keywords             = {maxent},
	priority             = 1,
	citeulike-article-id = 106350,
	comment              = {- Hard to read.
- Some sort maxent training + feature induction that avoids iterative scaling.},
	abstract             = {The maximum entropy framework has proved to be expressive and powerful for statistical language modelling, but it suffers from the computational expensiveness of model building. The iterative scaling algorithm that is used for parameter estimation is rather slow while the feature selection process might require parameters for many candidate features to be estimated many times. In this paper we present a novel approach for building maximum entropy models. Our approach uses a feature collocation lattice as a feature generation engine and selects candidate features without resorting to iterative scaling but instead through our own frequency redistribution algorithm. After the candidate features have been selected we use iterative scaling to estimate a fully saturated model for the maximal (factorial) feature space and then start to relax (eliminate) the most specific features. During constraint relaxation we always have a fully fit maximum entropy model, so we rank the constraints on the basis of their weights in the model. Therefore we don't have to use iterative scaling for the constraint ranking. Another important improvement in the efficiency of the model search is that we can use the weights of the old model as the initial values for the weights in the estimation of the new one since the simplified model deviates from its larger model only in a small number of constraints. This has proved to decrease the number of required iterations about tenfold. The proposed method is especially suitable for the tasks of Natural Language Modelling because it can successfully deal with data sparseness, overcome the assumption of independence, and deal with overlapping knowledge sources. In terms of practical results we discuss how our method has been applied to several language modelling tasks including document categorization, sentence boundary disambiguation, and part-of-speech tagging.}
}


@incollection{NIPS2005_238,
	abstract = {In this paper, we address the problem of statistical learning for multitopic text categorization (MTC), whose goal is to choose all relevant topics (a label) from a given set of topics. The proposed algorithm, Maximal Margin Labeling (MML), treats all possible labels as independent classes and learns a multi-class classifier on the induced multi-class categorization problem. To cope with the data sparseness caused by the huge number of possible labels, MML combines some prior knowledge about label prototypes and a maximal margin criterion in a novel way. Experiments with multi-topic Web pages show that MML outperforms existing learning algorithms including Support Vector Machines.},
	address = {Cambridge, MA},
	author = {Kazawa, Hideto   and Izumitani, Tomonori   and Taira, Hirotoshi   and Maeda, Eisaku  },
	booktitle = {Advances in Neural Information Processing Systems 17},
	citeulike-article-id = {106310},
	comment = {As an example -- qbit doesn't significantly correlate with label "Computer Science" nor with "Quantum Mechanics" but it significantly correlates with group that's the intersection of two labels. },
	editor = {Saul, Lawrence  K.  and Weiss, Yair   and Bottou, L{\'e}on  },
	keywords = {svm},
	priority = {2},
	publisher = {MIT Press},
	title = {Maximal Margin Labeling for Multi-Topic Text Categorization},
	year = {2005}
}


@article{citeulike:106267,
	abstract = {There are two cultures in the use of statistical modeling to reach conclusions from data. One assumes that the data are generated by a given stochastic data model. The other uses algorithmic models and treats the data mechanism as unknown. The statistical community has been committed to the almost exclusive use of data models. This commitment has led to irrelevant theory, questionable conclusions, and has kept statisticians from working on a large range of interesting current problems. Algorithmic modeling, both in theory and practice, has developed rapidly in fields outside statistics. It can be used both on large complex data sets and as a more accurate and informative alternative to data modeling on smaller data sets. If our goal as a field is to use data to solve problems, then we need to move away from exclusive dependence on data models and adopt a more diverse set of tools.},
	author = {Breiman, Leo  },
	citeulike-article-id = {106267},
	comment = {Talks about classical statistics (one's who assume model to start with) vs. algorithmic modelling. Good example with variable selection -- doing standard logistic regression and then looking at coefficients shows 7 out of 11 variables important. Alternative approach, fitting a random forest to data where i'th variable has noise added, shows only 2 variables to be important. Also shows that algorithmic modelling with robustness (ie boosting) can handle lots of variables with little data much better than model fitting.

Paper "Maximal Margin Labeling for Multi-Topic Text Categorization" gives an example where algorithmic modelling can discover structure that classical statistical modelling can't},
	doi = {10.1214/ss/1009213726},
	journal = {Statistical Science},
	keywords = {philosophy, statistics},
	number = {3},
	pages = {199--231},
	priority = {0},
	title = {Statistical Modeling: The Two Cultures},
	url = {http://projecteuclid.org/Dienst/UI/1.0/Summarize/euclid.ss/1009213726},
	volume = {16},
	year = {2001}
}


@article{citeulike:105632,
	abstract = {A new approach to clustering based on statistical physics is presented. The problem is formulated as fuzzy clustering and the association probability distribution is obtained by maximizing the entropy at a given average variance. The corresponding Lagrange multiplier is related to the temperature and motivates a deterministic annealing process where the free energy is minimized at each temperature. Critical temperatures are derived for phase transitions when existing clusters split. It is a hierarchical clustering estimating the most probable cluster parameters at various average variances.},
	author = {Rose and Gurewitz and Fox},
	citeulike-article-id = {105632},
	comment = {They define the cost of the clustering as sum of squared distances from cluster centers. Then they use principle of maximum entropy to find highest entropy cluster assignment for given total cost. Gradually lowering the cost becomes analogous to simulated annealing. For high total cost, there's one cluster, but for the opposite -- every point gets it's own cluster. While changing temperature is continuous, there are several discrete "phase transitions" when the number of clusters changes. This defines a hierarchy of clusterings for given data.},
	doi = {10.1103/PhysRevLett.65.945},
	journal = {Physical Review Letters},
	keywords = {clustering, information-theory, maxent, physics},
	month = {May},
	number = {8},
	pages = {945--948},
	priority = {0},
	title = {Statistical mechanics and phase transitions in clustering},
	url = {http://oasis.oregonstate.edu/search/tphysical review letters/tphysical review letters/1,2,3,E/l856\&FF=tphysical review letters online\&1,1,,1,0},
	volume = {65},
	year = {1990}
}


@misc{citeulike:105611,
	abstract = {A given question can be defined in terms of the set of statements or
assertions that answer it. Application of logical inference to these sets of
assertions allows one to derive the logic of inquiry among questions. There are
interesting symmetries between the logics of inference and inquiry; where
probability describes the degree to which a premise implies an assertion, there
exists an analogous measure that describes the bearing or relevance that a
question has on an outstanding issue. These have been extended to suggest that
the logic of inquiry results in functional relationships analogous to, although
more general than, those found in information theory.


Employing lattice theory, I examine in greater detail the structure of the
space of assertions and questions demonstrating that the symmetries between the
logical relations in each of the spaces derive directly from the lattice
structure. Furthermore, I show that while symmetries between the spaces exist,
the two lattices are not isomorphic. The lattice of assertions is described by
a Boolean lattice $2^N$, whereas the lattice of assuredly real questions is shown
to be a sublattice of the free distributive lattice $2^{2^N}$. Thus there does
not exist a one-to-one mapping of assertions to questions, there is no
reflection symmetry between the two spaces, and questions in general do not
possess complements. Last, with these lattice structures in mind, I discuss the
relationship between probability, relevance, and entropy.},
	archiveprefix = {arXiv},
	author = {Knuth, Kevin  H. },
	citeulike-article-id = {105611},
	eprint = {physics/0403089},
	keywords = {feature-selection, maxent, philosophy},
	month = {March},
	priority = {2},
	title = {What is a Question?},
	url = {http://arxiv.org/abs/physics/0403089},
	year = {2004}
}


@misc{citeulike:105610,
	abstract = {The Boolean lattice of logical statements induces the free distributive
lattice of questions. Inclusion on this lattice is based on whether one
question answers another. Generalizing the zeta function of the question
lattice leads to a valuation called relevance or bearing, which is a measure of
the degree to which one question answers another. Richard Cox conjectured that
this degree can be expressed as a generalized entropy. With the assistance of
yet another important result from Janos Aczel, I show that this is indeed the
case, and that the resulting inquiry calculus is a natural generalization of
information theory. This approach provides a new perspective on the Principle
of Maximum Entropy.},
	archiveprefix = {arXiv},
	author = {Knuth, Kevin  H. },
	citeulike-article-id = {105610},
	eprint = {physics/0409084},
	keywords = {maxent, philosophy},
	month = {September},
	priority = {2},
	title = {Measuring questions: relevance and its relation to entropy},
	url = {http://arxiv.org/abs/physics/0409084},
	year = {2004}
}


@misc{citeulike:105609,
	abstract = {What is a question? According to Cox a question can be identified with the
set of assertions that constitute possible answers. In this paper we propose a
different approach that combines the notion that questions are requests for
information with the notion that probability distributions represent
uncertainties resulting from lack of information. This suggests that to each
probability distribution one can naturally associate that particular question
which requests the information that is missing and vice-versa. We propose to
represent questions q by probability distributions. Next we consider how
questions relate to each other: to what extent is finding the answer to one
question relevant to answering another? A natural measure of relevance is
derived by requiring that it satisfy three desirable features (three axioms).
We find that the relevance of a question q to another question Q turns out to
be the relative entropy S[q,Q] of the corresponding distributions. An
application to statistical physics is briefly considered.},
	archiveprefix = {arXiv},
	author = {Caticha, Ariel  },
	citeulike-article-id = {105609},
	comment = {A question is a probability distribution},
	eprint = {cond-mat/0409175},
	keywords = {information-geometry, maxent},
	month = {September},
	priority = {2},
	title = {Questions, relevance and relative entropy},
	url = {http://arxiv.org/abs/cond-mat/0409175},
	year = {2004}
}


@misc{citeulike:105608,
	abstract = {We discuss how the method of maximum entropy, MaxEnt, can be extended beyond
its original scope, as a rule to assign a probability distribution, to a
full-fledged method for inductive inference. The main concept is the (relative)
entropy S[p|q] which is designed as a tool to update from a prior probability
distribution q to a posterior probability distribution p when new information
in the form of a constraint becomes available. The extended method goes beyond
the mere selection of a single posterior p, but also addresses the question of
how much less probable other distributions might be. Our approach clarifies how
the entropy S[p|q] is used while avoiding the question of its meaning.
Ultimately, entropy is a tool for induction which needs no interpretation.
Finally, being a tool for generalization from special examples, we ask whether
the functional form of the entropy depends on the choice of the examples and we
find that it does. The conclusion is that there is no single general theory of
inductive inference and that alternative expressions for the entropy are
possible.},
	archiveprefix = {arXiv},
	author = {Caticha, Ariel  },
	citeulike-article-id = {105608},
	doi = {10.1063/1.1751358},
	eprint = {physics/0311093},
	keywords = {information-geometry, maxent},
	month = {November},
	priority = {2},
	title = {Relative Entropy and Inductive Inference},
	url = {http://scitation.aip.org/getabs/servlet/GetabsServlet?prog=normal\&id=APCPCS000707000001000075000001\&idtype=cvips\&gifs=yes},
	year = {2003}
}


@book{citeulike:105607,
	abstract = {In the theory and practice of econometrics the model, the method and the data are all interdependent links in information recovery-estimation and inference. Seldom, however, are the economic and statistical models correctly specified, the data complete or capable of being replicated, the estimation rules `optimal' and the inferences free of distortion. Faced with these problems, Maximum Entropy Econometrics provides a new basis for learning from economic and statistical models that may be non-regular in the sense that they are ill-posed or underdetermined and the data are partial or incomplete. By extending the maximum entropy formalisms used in the physical sciences, the authors present a new set of generalized entropy techniques designed to recover information about economic systems. The authors compare the generalized entropy techniques with the performance of the relevant traditional methods of information recovery and clearly demonstrate theories with applications including pure inverse problems that include first order Markov processes, and input-output, multisectoral or SAM models, to inverse problems with noise that include statistical models subject to ill-conditioning, non-normal errors, heteroskedasticity, autocorrelation, censored, multinomial and simultaneous response data, as well as model selection and non-stationary and dynamic control problems. Maximum Entropy Econometrics will be of interest to econometricians trying to devise procedures for recovering information from partial or incomplete data, as well as quantitative economists in finance and business, statisticians, and students and applied researchers in econometrics, engineering and the physical sciences.},
	author = {Judge, George   and Miller, Douglas   and Golan, Amos},
	citeulike-article-id = {105607},
	comment = {- Mostly on continuous valued inverse problems, pure and noisy.
- Two chapters on discrete stuff. 
- "Random Variables are merely conceptual devices used to express the prior and sample knowledge in a mutually compatible format."},
	howpublished = {Hardcover},
	isbn = {0471953113},
	keywords = {book, maxent},
	month = {March},
	priority = {2},
	publisher = {{John Wiley \& Sons}},
	title = {Maximum Entropy Econometrics: Robust Estimation With Limited Data},
	url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0471953113},
	year = {1996}
}


@article{citeulike:105606,
	author = {Golan, Amos},
	citeulike-article-id = {105606},
	comment = {Introduces uncertainty in the constraints},
	journal = {Journal of the American Statistical Association},
	keywords = {maxent},
	priority = {2},
	title = {A maximum entropy approach to recovering information from multinomial response data},
	url = {http://www.google.com/search?hl=en\&lr=\&q="maximum entropy approach to recovering information from multinomial response data"\&btnG=Search},
	year = {1996}
}


@article{citeulike:105605,
	abstract = {The need for inducing a probability distribution from partial data and the complementary problem of the analysis of an observed distribution in terms of fewer relevant variables occur in many branches of physics. For reproducible experiments, consistency conditions which must be satisfied by any algorithm for inferring a discrete probability distribution with given averages are formulated. The only consistent algorithm is the one leading to the distribution of maximal entropy subject to the given constraints.},
	author = {Tishby and Levine and Tikochinsky},
	citeulike-article-id = {105605},
	journal = {Physical Review Letters},
	keywords = {maxent},
	priority = {2},
	title = {Consistent inference of probabilities for reproducible experiments},
	url = {http://oasis.oregonstate.edu/search/tphysical review letters/tphysical review letters/1,2,3,E/l856\&FF=tphysical review letters\&2,,2,1,0},
	year = {1984}
}


@misc{citeulike:105604,
	abstract = {A semiparametric estimator for evaluating the parameters of data generated under a sample selection process is developed. This estimator is based on the generalized maximum entropy estimator and performs well for small and ill-posed samples. Theoretical and sampling comparisons with parametric and semiparametric estimators are given. This method and standard ones are applied to three small-sample empirical applications of the wage-participation model for female teenage heads of households, immigrants, and Native Americans.},
	author = {Golan, Amos   and Perloff, Jeffrey  M. },
	citeulike-article-id = {105604},
	howpublished = {Working paper},
	keywords = {maxent},
	priority = {2},
	title = {A Small-Sample Estimator for the Sample-Selection Model},
	url = {http://ideas.repec.org/p/cdl/agrebk/11033.html},
	year = {2001}
}


@article{citeulike:105599,
	abstract = {The minimum distance approach for reconstructing a positive function based on knowledge of finitely many linear functional values is examined. Two important classes of directed distances for signal processing and statistical inference are discussed. By imposing conditions analogous to those satisfied by linear projections in Hilbert space, two logarithmic entropy principles are derived. One of these involves the Itakura--Saito distortion measure of communication theory and uniquely extends Burg's maximum entropy method to incorporate prior knowledge. The other uses the Kullback--Leibler distance of statistics.},
	author = {Jones, Lee  },
	citeulike-article-id = {105599},
	doi = {10.1137/0149037},
	journal = {SIAM Journal on Applied Mathematics},
	keywords = {maxent},
	priority = {2},
	title = {Approximation-Theoretic Derivation of Logarithmic Entropy Principles for Inverse Problems and Unique Extension of the Maximum Entropy Method to Incorporate Prior Knowledge},
	url = {http://locus.siam.org/SIAP/volume-49/art\_0149037.html},
	volume = {49},
	year = {1989}
}


@article{citeulike:105598,
	author               = {Bard, Yonathan},
	title                = {Estimation of State Probabilities using the Maximum Entropy principle},
	journal              = {IBM Journal of Research and Development},
	year                 = 1980,
	url                  = {http://www.research.ibm.com/journal/rd/245/ibmrd2405F.pdf},
	keywords             = {maxent},
	priority             = 2,
	citeulike-article-id = 105598,
	abstract             = {A simple method is derived for computing state probabilities of a system when the probabilities of certain aggregate states are known. The method is based on maximizing the system entropy. It is shown that the results obtained by the method satisfy certain assumptions on statistical independence between events. The method is applied to a problem arising in computer performance analysis.}
}


@misc{citeulike:104385,
	abstract = {Prediction involves estimating the unknown value of an attribute of a system under study given the
values of other measured attributes. In prediction (machine) learning the prediction rule is derived
from data consisting of previously solved cases. Most methods for predictive learning were originated
many years ago at the dawn of the computer age. Recently two new techniques have emerged that
have revitalized the field. These are support vector machines and boosted decision trees. This
paper provides an introduction to these two new methods tracing their respective ancestral roots to
standard kernel methods and ordinary decision trees.},
	author = {Friedman, Jerome  H. },
	citeulike-article-id = {104385},
	keywords = {svm},
	priority = {2},
	title = {Recent Advances in Predictive (Machine) Learning},
	url = {http://www-stat.stanford.edu/~jhf/ftp/machine.pdf}
}


@unpublished{citeulike:104383,
	abstract = {This issue Scott Aaronson writes quite an interesting (and opinionated) column on whether the P = NP question is independent of the usual axiom systems. Enjoy!},
	author = {Fortnow, Lance   and Aaronson, Scott  },
	citeulike-article-id = {104383},
	comment = {If P!=NP is independent, then 
either 
1) P!=NP but there's no proof in the given axiom system
or
2) there's a polynomial-time algorithm for SAT but no proof of its correctness},
	keywords = {computational-complexity},
	note = {Computational Complexity Column, Bulletin of the {EATCS}},
	priority = {2},
	title = {The Computational Complexity Column},
	url = {http://theorie.informatik.uni-ulm.de/Personen/toran/beatcs/column81.pdf}
}


@inproceedings{citeulike:104382,
	abstract = {In this paper, we study a family of semi-supervised learning algorithms for "aligning" different data sets that are characterized by the same underlying manifold. The optimizations of these algorithms are based on graphs that provide a discretized approximation to the manifold. Partial alignments of the data sets -- obtained from prior knowledge of their manifold structure or from pairwise correspondences of subsets of labeled examples -- are completed by integrating supervised signals with unsupervised frameworks for manifold learning. As an illustration of this semisupervised setting, we show how to learn mappings between different data sets of images that are parameterized by the same underlying modes of variability (e.g., pose and viewing angle). The curse of dimensionality in these problems is overcome by exploiting the low dimensional structure of image manifolds.},
	author = {Ham and Lee and Saul},
	booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics},
	citeulike-article-id = {104382},
	keywords = {manifolds, semisupervised, statistics},
	priority = {2},
	title = {Semisupervised alignment of manifolds},
	url = {http://www.cis.upenn.edu/~lsaul/abstracts.html\#semi\_aistats05},
	year = {2005}
}


@article{citeulike:103855,
	abstract = {A family of kernels for statistical learning is introduced that exploits the geometric structure of statistical models. The kernels are based on the heat equation on the Riemannian manifold defined by the Fisher information metric associated with a statistical family, and generalize the Gaussian kernel of Euclidean space. As an important special case, kernels based on the geometry of multinomial families are derived, leading to kernel-based learning algorithms that apply naturally to discrete data. Bounds on covering numbers and Rademacher averages for the kernels are proved using bounds on the eigenvalues of the Laplacian on Riemannian manifolds. Experimental results are presented for document classification, for which the use of multinomial geometry is natural and well motivated, and improvements are obtained over the standard use of Gaussian or linear kernels, which have been the standard for text classification.},
	author = {Lafferty, John   and Lebanon, Guy  },
	citeulike-article-id = {103855},
	issn = {1533-7928},
	journal = {Journal of Machine Learning Research},
	keywords = {information-geometry, statistics},
	pages = {129--163},
	priority = {2},
	publisher = {MIT Press},
	title = {Diffusion Kernels on Statistical Manifolds},
	url = {http://portal.acm.org/citation.cfm?id=1046920.1046925},
	volume = {6},
	year = {2005}
}


@techreport{citeulike:103751,
	abstract = {This report describes Lagrange multipliers and some selected subtopics from matrix analysis from a machine learning perspective. The goal is to give a detailed description of a number of mathematical constructions that are widely used in applied machine learning.},
	author = {Burges, Christopher  J. },
	citeulike-article-id = {103751},
	comment = {Lagrange multipliers, Calculus of Variations, Matrix Dual Basis, Levi-Civita symbol, SVD in 7 steps},
	institution = {Microsoft Research},
	keywords = {math, notes},
	month = {June},
	number = {MSR-TR-2004-56},
	priority = {3},
	publisher = {Springer-Verlag},
	title = {Some Notes on Applied Mathematics for Machine Learning},
	url = {http://research.microsoft.com/research/pubs/view.aspx?type=Technical Report\&id=760},
	year = {2004}
}


@misc{citeulike:103750,
	abstract = {I define a natural measure of the complexity of a parametric distribution
relative to a given true distribution called the \emph{razor} of a model family.
The Minimum Description Length principle (MDL) and Bayesian inference are shown
to give empirical approximations of the razor via an analysis that
significantly extends existing results on the asymptotics of Bayesian model
selection. I treat parametric families as manifolds embedded in the space of
distributions and derive a canonical metric and a measure on the parameter
manifold by appealing to the classical theory of hypothesis testing. I find
that the Fisher information is the natural measure of distance, and give a
novel justification for a choice of Jeffreys prior for Bayesian inference. The
results of this paper suggest corrections to MDL that can be important for
model selection with a small amount of data. These corrections are interpreted
as natural measures of the simplicity of a model family. I show that in a
certain sense the logarithm of the Bayesian posterior converges to the
logarithm of the \emph{razor} of a model family as defined here. Close
connections with known results on density estimation and ``information
geometry'' are discussed as they arise.},
	archiveprefix = {arXiv},
	author = {Balasubramanian, Vijay  },
	citeulike-article-id = {103750},
	comment = {Same idea as in "Counting probability distributions" paper, but more in depth. Goes into measure theory, and derivation of the geometry of the probabilistic manifold. Some geometric background. No experimental results. I guess that's normal for a mathematical physics paper.},
	eprint = {adap-org/9601001},
	keywords = {information-geometry, statistics},
	month = {January},
	priority = {0},
	title = {A Geometric Formulation of Occam's Razor for Inference of Parametric Distributions},
	url = {http://arxiv.org/abs/adap-org/9601001},
	year = {1996}
}


@article{citeulike:102807,
	abstract = {The principle of maximum entropy is a method for assigning values to probability distributions on the basis of partial information. In usual formulations of this and related methods of inference one assumes that this partial information takes the form of a constraint on allowed probability distributions. In practical applications, however, the information consists of empirical data. A constraint rule is then employed to these data. Usually one adopts the rule to equate the expectation values of certain functions with their empirical averages. There are, however, various other ways in which one can construct constraints from empirical data, which makes the maximum entropy principle lead to very different probability assignments. This paper shows that an argument by Jaynes to justify the usual constraint rule is unsatisfactory and investigates several alternative choices. The choice of a constraint rule is also shown to be of crucial importance to the debate on the question whether there is a conflict between the methods of inference based on maximum entropy and Bayesian conditionalization.},
	author = {Uffink, Jos},
	citeulike-article-id = {102807},
	journal = {Studies in History and Philosophy of Modern Physics},
	keywords = {maxent, statistics},
	pages = {47--79},
	priority = {0},
	title = {The constraint rule of the maximum entropy principle},
	url = {http://www.phys.uu.nl/~wwwgrnsl/jos/mep2def/mep2def.html},
	volume = {27},
	year = {1996}
}


@article{citeulike:102806,
	author               = {Uffink, Jos},
	title                = {Can the maximum entropy principle be explained as a consistency requirement?},
	year                 = 1997,
	month                = {February},
	url                  = {http://www.phys.uu.nl/~wwwgrnsl/jos/mepabst/mepabst.html},
	keywords             = {maxent, statistics},
	priority             = 0,
	citeulike-article-id = 102806,
	abstract             = {The principle of maximum entropy is a general method to assign values to probability distributions on the basis of partial information. This principle, introduced by Jaynes in 1957, forms an extension of the classical principle of insufficient reason. It has been further generalized, both in mathematical formulation and in intended scope, into the principle of maximum relative entropy or of minimum information. It has been claimed that these principles are singled out as unique methods of statistical inference that agree with certain compelling consistency requirements. This paper reviews these consistency arguments and the surrounding controversy. It is shown that the uniqueness proofs are flawed, or rest on unreasonably strong assumptions. A more general class of inference rules, maximizing the so-called R\'{e}nyi entropies, is exhibited which also fulfill the reasonable part of the consistency assumptions.}
}


@article{citeulike:90008,
  author               = {Schwarz, Gideon  },
  title                = {Estimating the Dimension of a Model},
  journal              = {The Annals of Statistics},
  volume               = {6},
  number               = {2},
  pages                = {461--464},
  year                 = {1978},
  url                  = {http://links.jstor.org/sici?sici=0090-5364\%28197803\%296\%3A2\%3C461\%3AETDOAM\%3E2.0.CO\%3B2-5},
  abstract             = {The problem of selecting one of a number of models of different dimensions is treated by finding its Bayes solution, and evaluating the leading terms of its asymptotic expansion. These terms are a valid large-sample criterion beyond the Bayesian context, since they do not depend on the a priori distribution.},
  keywords             = {model-selection, statistics},
  citeulike-article-id = {90008},
  priority             = {2},
}


@misc{citeulike:70828,
	abstract = {When the probability of measuring a particular value of some quantity varies
inversely as a power of that value, the quantity is said to follow a power law,
also known variously as Zipf's law or the Pareto distribution. Power laws
appear widely in physics, biology, earth and planetary sciences, economics and
finance, computer science, demography and the social sciences. For instance,
the distributions of the sizes of cities, earthquakes, forest fires, solar
flares, moon craters and people's personal fortunes all appear to follow power
laws. The origin of power-law behaviour has been a topic of debate in the
scientific community for more than a century. Here we review some of the
empirical evidence for the existence of power-law forms and the theories
proposed to explain them.},
	archiveprefix = {arXiv},
	author = {Newman, M. E. J.},
	citeulike-article-id = {70828},
	eprint = {cond-mat/0412004},
	keywords = {statistics},
	month = dec,
	priority = {2},
	title = {Power laws, {Pareto} distributions and {Zipf's} law},
	url = {http://arxiv.org/abs/cond-mat/0412004},
	year = {2004}
}


@article{citeulike:96189,
	author = {Hoeffding, Wassily},
	citeulike-article-id = {96189},
	comment = {No good definition of "best" non-parametric test. Says desirable requirements are unbiasedness and consistency...what about power?

Introduces a rank test for independence, proves asymptotic consistency, normality. Proves non-existence of unbiased test for independence.},
	journal = {The Annals of Mathematical Statistics},
	keywords = {statistics},
	number = {4},
	pages = {546--557},
	priority = {2},
	title = {A Non-Parametric Test of Independence},
	url = {http://links.jstor.org/sici?sici=0003-4851\%28194812\%2919\%3A4\%3C546\%3AANTOI\%3E2.0.CO\%3B2-O},
	volume = {19},
	year = {1948}
}


@article{citeulike:100985,
  author               = {Efron, Bradley  },
  title                = {The Geometry of Exponential Families},
  journal              = {The Annals of Statistics},
  volume               = {6},
  number               = {2},
  pages                = {362--376},
  year                 = {1978},
  url                  = {http://links.jstor.org/sici?sici=0090-5364\%28197803\%296\%3A2\%3C362\%3ATGOEF\%3E2.0.CO\%3B2-Y},
  keywords             = {exponential-families, information-geometry, statistics},
  citeulike-article-id = {100985},
  priority             = {2},
}


@misc{citeulike:86999,
	abstract = {This talk reviews some mathematical and physical ideas related to the notion
of dimension. After a brief historical introduction, various modern
constructions from fractal geometry, noncommutative geometry, and theoretical
physics are invoked and compared.},
	archiveprefix = {arXiv},
	author = {Manin, Yuri I.},
	citeulike-article-id = {86999},
	comment = {All algebraic geometry, not analytic geometry.},
	eprint = {math.AG/0502016},
	keywords = {algebra, geometry, math},
	month = feb,
	priority = {1},
	title = {The notion of dimension in geometry and algebra},
	url = {http://arxiv.org/abs/math.AG/0502016},
	year = {2005}
}


@article{citeulike:93879,
	abstract = {We describe a Bayesian approach for learning Bayesian networks from a combination of prior knowledge and statistical data. First and foremost, we develop a methodology for assessing informative priors needed for learning. Our approach is derived from a set of assumptions made previously as well as the assumption of likelihood equivalence, which says that data should not help to discriminate network structures that represent the same assertions of conditional independence. We show that likelihood equivalence when combined with previously made assumptions implies that the user's priors for network parameters can be encoded in a single Bayesian network for the next case to be seen---a prior network---and a single measure of confidence for that network. Second, using these priors, we show how to compute the relative posterior probabilities of network structures given data. Third, we describe search methods for identifying network structures with high posterior probabilities. We describe polynomial algorithms for finding the highest-scoring network structures in the special case where every node has at most $k = 1$ parent. For the general case ($k > 1$), which is NP-hard, we review heuristic search algorithms including local search, iterative local search, and simulated annealing. Finally, we describe a methodology for evaluating Bayesian-network learning algorithms, and apply this approach to a comparison of various approaches.},
	author = {Heckerman, D. and Geiger, D. and Chickering, D. M.},
	citeulike-article-id = {93879},
	issn = {0885-6125},
	journal = {Machine Learning},
	keywords = {bayesian, bayesnet, statistics},
	month = sep,
	number = {3},
	pages = {197--243},
	priority = {2},
	title = {Learning {Bayesian} Networks: The Combination of Knowledge and Statistical Data},
	url = {http://www.ingentaconnect.com/content/klu/mach/1995/00000020/00000003/00422402},
	volume = {20},
	year = {1995}
}


@article{citeulike:100137,
	abstract = {Developing theoretical foundations for learning is a key step towards understanding intelligence. 'Learning from examples' is a paradigm in which systems (natural or artificial) learn a functional relationship from a training set of examples. Within this paradigm, a learning algorithm is a map from the space of training sets to the hypothesis space of possible functional solutions. A central question for the theory is to determine conditions under which a learning algorithm will generalize from its finite training set to novel examples. A milestone in learning theory was a characterization of conditions on the hypothesis space that ensure generalization for the natural class of empirical risk minimization (ERM) learning algorithms that are based on minimizing the error on the training set. Here we provide conditions for generalization in terms of a precise stability property of the learning process: when the training set is perturbed by deleting one example, the learned hypothesis does not change much. This stability property stipulates conditions on the learning map rather than on the hypothesis space, subsumes the classical theory for ERM algorithms, and is applicable to more general algorithms. The surprising connection between stability and predictivity has implications for the foundations of learning theory and for the design of novel algorithms, and provides insights into problems as diverse as language learning and inverse problems in physics and engineering.},
	affiliation = {Center for Biological and Computational Learning, McGovern Institute Computer Science Artificial Intelligence Laboratory, Brain Sciences Department, MIT, Cambridge, Massachusetts 02139, USA. tp@ai.mit.edu},
	author = {Poggio, T. and Rifkin, R. and Mukherjee, S. and Niyogi, P.},
	citeulike-article-id = {100137},
	comment = {This is a Computational Learning Theory Paper, why was it published in Nature?!? },
	doi = {10.1038/nature02341},
	issn = {1476-4687},
	journal = {Nature},
	keywords = {colt, machine-learning},
	month = mar,
	number = {6981},
	pages = {419--422},
	priority = {2},
	title = {General conditions for predictivity in learning theory},
	url = {http://dx.doi.org/10.1038/nature02341},
	volume = {428},
	year = {2004}
}


@article{citeulike:100669,
	abstract = {The problem of dimensionality reduction arises in many fields of information processing, including machine learning, data compression, scientific visualization, pattern recognition, and neural computation. Here we describe locally linear embedding (LLE), an unsupervised learning algorithm that computes low dimensional, neighborhood preserving embeddings of high dimensional data. The data, assumed to be sampled from an underlying manifold, are mapped into a single global coordinate system of lower dimensionality. The mapping is derived from the symmetries of locally linear reconstructions, and the actual computation of the embedding reduces to a sparse eigenvalue problem. Notably, the optimizations in LLE--though capable of generating highly nonlinear embeddings--are simple to implement, and they do not involve local minima. In this paper, we describe the implementation of the algorithm in detail and discuss several extensions that enhance its performance. We present results of the algorithm applied to data sampled from known manifolds, as well as to collections of images of faces, lips, and handwritten digits. These examples are used to provide extensive illustrations of the algorithm's performance--both successes and failures--and to relate the algorithm to previous and ongoing work in nonlinear dimensionality reduction.},
	author = {Saul, Lawrence K. and Roweis, Sam T.},
	citeulike-article-id = {100669},
	comment = {Gives detailed description of the "Locally Linear Embedding" algorithm. Compares to Isomap and others, shows where it fails.

LLE does the following -- express each point as a convex combination of k neighbours such as squared error is minimal, call this representation Z. Linear problem. Now restrict number of dimensions to d and find coordinates for the points such that representation Z is more or less preserved. Also linear problem (after some unobvious linear algebra). This image in d dimensions is what the "d-dimensional locally linear" manifold of the data looks like. Due to nearest neighbour search, this would be O(n^2), but he gives impressive examples of this applied to thousands of points. Good pictures.},
	journal = {Journal of Machine Learning Research},
	keywords = {manifolds, unsupervised},
	pages = {119--155},
	priority = {0},
	title = {Think Globally, Fit Locally: Unsupervised Learning of Low Dimensional Manifolds},
	url = {http://jmlr.csail.mit.edu/},
	volume = {4},
	year = {2003}
}


@article{citeulike:100668,
	author = {Ay, Nihat},
	citeulike-article-id = {100668},
	comment = {Very nice diagram of the set of factorizable (through independence) distributions embedded in the set of all distributions.

Apparently biologically motivated "Infomax principle" makes probability distributions with highest degree of dependence relevant. The paper formulates the process of finding low dimensional probability distributions with high degree of dependence in information geometric terms (find points as far as possible from the "factorizable" manifold)},
	journal = {The Annals of Probability},
	keywords = {information-geometry, statistics},
	number = {1},
	pages = {416--436},
	priority = {2},
	title = {An Information-Geometric Approach to a Theory of Pragmatic Structuring},
	url = {http://projecteuclid.org/Dienst/UI/1.0/Summarize/euclid.aop/1020107773},
	volume = {30},
	year = {2002}
}


@article{citeulike:93541,
	abstract = {A central problem in science is deciding among competing explanations of data containing random errors. We argue that assessing the "complexity" of explanations is essential to a theoretically well-founded model selection procedure. We formulate model complexity in terms of the geometry of the space of probability distributions. Geometric complexity provides a clear intuitive understanding of several extant notions of model complexity. This approach allows us to reconceptualize the model selection problem as one of counting explanations that lie close to the "truth." We demonstrate the usefulness of the approach by applying it to the recovery of models in psychophysics.},
	affiliation = {Department of Psychology, Ohio State University, 1885 Neil Avenue, Columbus, OH 43210-1222, USA. myung.1@osu.edu},
	author = {Myung, I. J. and Balasubramanian, V. and Pitt, M. A.},
	citeulike-article-id = {93541},
	comment = {A very good article if you are new to Information Geometry. Relying on Euclidean geometry of the parameter space and using standard (Jordan) measure to find volume doesn't work to compare model complexities of different model dimensions because model with n+1 parameters becomes infinitely more complex than a model of n parameters. Using metrics like BIC is deficient because it can't tell apart between two models of the same dimension. Finally if one reparametrizes the model, you could get a different measure if you rely on the parameter space. The paper shows how all three problems can be addressed by using Fisher Information Measure. It motivates the process of measuring as a process of "counting" statistically distinguishable distribution, since "counts" are dimensionless, we can compare models of different dimensionality.

After motivating Fisher information, it shows how to derive a model selection principle, which ends up similar to Rissanen's stochastic complexity, and shows how to explain BIC and SC in terms of information geometry

Good on theory but no (real) experimental results.},
	doi = {10.1073/pnas.170283897},
	issn = {0027-8424},
	journal = {Proceedings of the National Academy of Sciences of the United States of America},
	keywords = {information-geometry, model-selection, statistics},
	month = oct,
	number = {21},
	pages = {11170--11175},
	priority = {0},
	title = {Counting probability distributions: differential geometry and model selection},
	url = {http://dx.doi.org/10.1073/pnas.170283897},
	volume = {97},
	year = {2000}
}


@article{citeulike:94597,
	abstract = {Some simple heuristic properties of conditional independence are shown to form a conceptual framework for much of the theory of statistical inference. This framework is illustrated by an examination of the role of conditional independence in several diverse areas of the field of statistics. Topics covered include sufficiency and ancillarity, parameter identification, causal inference, prediction sufficiency, data selection mechanisms, invariant statistical models and a subjectivist approach to model-building.},
	author = {Dawid, A. P.},
	citeulike-article-id = {94597},
	journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
	keywords = {statistics},
	number = {1},
	pages = {1--31},
	priority = {2},
	title = {Conditional Independence in Statistical Theory},
	url = {http://links.jstor.org/sici?sici=0035-9246\%281979\%2941\%3A1\%3C1\%3ACIIST\%3E2.0.CO\%3B2-T},
	volume = {41},
	year = {1979}
}


@techreport{citeulike:100221,
	abstract = {We propose a family of learning algorithms based on a new form of regularization that allows us to exploit the geometry of the marginal distribution. We focus on a semi-supervised framework that incorporates labeled and unlabeled data in a general-purpose learner. Some transductive graph learning algorithms and standard methods including Support Vector Machines and Regularized Least Squares can be obtained as special cases. We utilize properties of Reproducing Kernel Hilbert spaces to prove new Representer theorems that provide theoretical basis for the algorithms. As a result (in contrast to purely graph based approaches) we obtain a natural out-of-sample extension to novel examples and so are able to handle both transductive and truly semi-supervised settings. We present experimental evidence suggesting that our semi-supervised algorithms are able to use unlabeled data effectively. Finally we have a brief discussion of unsupervised and fully supervised learning within our general framework.},
	author = {Belkin, Mikhail and Niyogi, Partha and Sindhwani, Vikas},
	citeulike-article-id = {100221},
	institution = {University of Chicago},
	keywords = {information-geometry, machine-learning, manifolds},
	month = aug,
	number = {TR-2004-06},
	priority = {2},
	title = {Manifold Regularization: A Geometric Framework for Learning from Examples},
	url = {http://www.cs.uchicago.edu/research/publications/techreports/TR-2004-06},
	year = {2004}
}


@book{citeulike:100214,
  author               = {Brown, L. D. },
  title                = {Fundamentals of statistical exponential families: with applications in statistical decision theory},
  publisher            = {Institute of Mathematical Statistics},
  year                 = {1986},
  isbn                 = {0940600102},
  url                  = {http://portal.acm.org/citation.cfm?id=41464},
  keywords             = {exponential-families},
  citeulike-article-id = {100214},
  priority             = {2},
}


@inproceedings{citeulike:99893,
	author = {Klein, Dan and Manning, Chris},
	booktitle = {Eighteenth International Joint Conference on Artificial Intelligence (IJCAI-03)},
	citeulike-article-id = {99893},
	comment = {The idea is to come up with special graph structures (abstractions) on which inference is more efficient, and at the same time forms an admissible heuristic for A* search. Shows an improvement of trillion times over standard dynamic programming on multiple sequence matching problem. Not clear how to apply to POS tagging},
	keywords = {crf, mlrg},
	priority = {0},
	title = {Factored {A*} Search for Models over Sequences and Trees},
	url = {http://www.cs.berkeley.edu/~klein/papers/factored-astar.pdf},
	year = {2003}
}


@misc{citeulike:99892,
	abstract = {The method of maximum entropy (ME) is extended to address the following
problem: Once one accepts that the ME distribution is to be preferred over all
others, the question is to what extent are distributions with lower entropy
supposed to be ruled out. Two applications are given. The first is to the
theory of thermodynamic fluctuations. The formulation is exact, covariant under
changes of coordinates, and allows fluctuations of both the extensive and the
conjugate intensive variables. The second application is to the construction of
an objective prior for Bayesian inference. The prior obtained by following the
ME method to its inevitable conclusion turns out to be a special case of what
are currently known under the name of entropic priors.},
	archiveprefix = {arXiv},
	author = {Caticha, Ariel},
	citeulike-article-id = {99892},
	comment = {Introduces motivation for entropic prior, motivates Maximum Entropy (ME not MaxEnt) principle.

The idea is that we usually have a prior over data, but not prior over parameters of the model. For discrete data, our uninformative prior is p(k)=1/n where n is number of choices. Entropic prior tells us how to get the "least assumptive" prior over parameters given our prior over data (ie, uniform). This "least assumptive" prior will favour higher entropy distributions, hence the name. Since higher entropy means "more simple" favouring higher entropy distributions coincides with our intuition.

My slides from presenting the paper: http://web.engr.oregonstate.edu/~bulatov/research/reports/entropic\_prior/Maximum\%20Entropy\%20and\%20Priors\_files/frame.htm},
	eprint = {math-ph/0008017},
	keywords = {information-geometry, maxent},
	month = aug,
	priority = {0},
	title = {Maximum entropy, fluctuations and priors},
	url = {http://arxiv.org/abs/math-ph/0008017},
	year = {2000}
}