\documentclass[11pt]{article}
\bibliographystyle{plain}
\usepackage{amssymb}
\usepackage{times}
%\usepackage{doublespace}
\thispagestyle{empty}
\newcommand{\mse}{mean-square error }
%TPAMI-0029-0403
\newcommand{\hide}[1]{}
\newcommand{\out}[1]{}

\newcommand{\ui}{^{(i)}}
\newcommand{\us}{^{(s)}}
\newcommand{\beq}{\begin{equation}}
\newcommand{\eeq}{\end{equation}}
\newcommand{\beqa}{\begin{eqnarray}}
\newcommand{\eeqa}{\end{eqnarray}}

\newcommand{\ie} {{\it i.e., }}
\newcommand{\eg} {{\it e.g., }}

\newcommand{\cl}[1]{{{\cal{#1}}}}

\newcommand{\mr}[1]{{\mathrm{#1}}}
\newcommand{\mb}[1]{{\mathbf{#1}}}


\newcommand{\mycaption}[3]{\renewcommand{\baselinestretch}{1}\caption[#1]{#2.}{#3}\renewcommand{\baselinestretch}{1.5}}

\newcommand{\mycaptionS}[1]{\renewcommand{\baselinestretch}{1}\caption[#1]{\small #1}\renewcommand{\baselinestretch}{1.5}}

\newcommand{\CapMViewc}{{Estimating same hand pose at $26$ viewpoints. The feedback function used was estimated from data. The figure has two sets of columns. Each column has the ground truth, MO, and best three MS samples. The viewpoint $(\beta_1,\beta_2)$ is indicated on the right side of each column}}


\newcommand{\CapMViewb}{{Example estimated hand poses at random view points obtained using the MS algorithm. Feedback function was estimated from data. Columns 1-2 show the ground truth and the estimate using the MO algorithm, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}

\newcommand{\CapMViewcE}{{Estimating same hand pose at $26$ viewpoints. The feedback function used was the computer graphics rendering. The figure has two sets of columns. Each column has the ground truth, MO, and best three MS samples. The viewpoint $(\beta_1,\beta_2)$ is indicated on the right side of each column}}

\newcommand{\CapMViewbE}{{Example estimated hand poses at random view points obtained using the MS approach. Feedback function was computer graphics rendering. Columns 1-2 show the ground truth and the estimate using the MO algorithm, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}

%-----

\newcommand{\CapTestI}{{40 examples of estimated hand poses chosen uniformly at random. Reconstruction found using the Mean Output (MO) approach. The feedback function used was estimated from data. Each example consists of a pair of images: ground-truth (top), and estimate obtained using the mean output algorithm (bottom)}}

\newcommand{\CapTestIE}{{40 examples of estimated hand poses chosen uniformly at random. Reconstruction found using the Mean Output (MO) approach. The feedback function was computed using computer graphics rendering. Each example consists of a pair of images: ground-truth (top), and estimate obtained using the mean output algorithm (bottom). For comparison, the frames  are the same as those used when feedback was estimated from data}}

\newcommand{\CapRTestI}{{40 examples of estimated hand poses captured every 0.9 secs from real video (RV). Reconstruction found using the Mean Output (MO) approach. The feedback function used was estimated from data}} %Each example consists of a pair of images: ground-truth (top), and estimate obtained using the mean output algorithm (bottom)}}

\newcommand{\CapRTestIE}{{40 examples of estimated hand poses captured every 0.9 secs from real video (RV). Reconstruction found using the Mean Output (MO) approach. The feedback function was computed using computer graphics rendering}} %Each example consists of a pair of images: input video frame (top), and estimate obtained using the mean output algorithm (bottom).  Note: for comparison frames are same as those used when feedback was estimated from data}}

\newcommand{\CapTestII}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach. The feedback function  was estimated from data. Columns 1-2 show the ground truth and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}


\newcommand{\CapTestIIE}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach. The feedback function was computed using computer graphics rendering. Columns 1-2 show the ground truth and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}

\newcommand{\CapRTestII}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach using real video (RV). The feedback function was estimated from data}} %%. Frames were chosen every 0.9 secs. Columns 1-2 show the input video frame and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}

\newcommand{\CapRTestIIE}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach and real video (RV). The feedback function was computed using computer graphics rendering}} %%. Frames were chosen every 0.9 secs. Columns 1-2 show the input video frame and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}


\newcommand{\CapTestIIWE}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach. The feedback function was computed using computer graphics rendering. Column 1 shows ground truth, columns 2-6 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}



%%%% ----- Multiple

\newcommand{\CapMTestI}{{40 examples of estimated hand poses chosen uniformly at random and reconstruction found using Mean Output (MO) approach. The feedback function used was estimated from data. Each example consists of a pair of images: ground-truth (top), and estimate obtained using the mean output algorithm (bottom)}}

\newcommand{\CapMTestIE}{{40 examples of estimated hand poses chosen uniformly at random and reconstruction found using Mean Output (MO) approach. The feedback function was computed using computer graphics rendering. Each example consists of a pair of images: ground-truth (top), and estimate obtained using the mean output algorithm (bottom). Note: for comparison frames are same as those used when feedback was estimated from data}}


\newcommand{\CapMTestII}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach. Views and poses were chosen uniformly at random. The feedback function was estimated from data. Columns 1-2 show the ground truth and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}


\newcommand{\CapMTestIIE}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach. Views and poses were chosen uniformly at random. The feedback function was computed using computer graphics rendering.  Columns 1-2 show the ground truth and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}



\newcommand{\CapRTestBodyII}{{Example estimated body poses obtained using the Multiple Sample (MS) approach using real video (RV). The feedback function was estimated from data. Frames were chosen every $\frac{2}{3}$ secs. Column 1 shows the input video frame, columns 2-6 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}


\usepackage{psfig}
%\pssilent
%\renewcommand{\psfig}[1]{}

\renewcommand{\dbltopfraction}{1.0}
\renewcommand{\textfraction}{0.0}
\renewcommand{\topfraction}{1.0}
\renewcommand{\bottomfraction}{1.0}
\renewcommand{\baselinestretch}{1.5}
\def\changed#1{{\bf{#1}}}
%\def\changed#1{#1}
%\psdraft
\newcommand{\comment}[1]{{\large\it #1}}
\setlength{\textwidth}{6.5in} \setlength{\textheight}{9.0in}
\setlength{\oddsidemargin}{0.0in} %\setlength{\topmargin}{-0.5in}
\begin{document}
\thispagestyle{empty}

%\title{A generative/discriminative framework for estimating articulated body pose from a single image}
\title{{\vspace*{-1in}\normalsize{\tt Submitted 2003. Portions of this paper appeared in NIPS 14 and ICCV 01}}\\
{\sf\LARGE Combining generative and discriminative models for inferring articulated body pose from a single image}}
%~\\
%~\\
%~\\
%}
\author{\begin{tabular}{cc}
R\'{o}mer Rosales  & Stan Sclaroff\\
Probabilistic and Statistical Inference Group& Image and Video
Computing Group \\
Dept.\ of Electrical and Computer Engineering & Dept.\ of Computer
Science \\
 University of Toronto & Boston University \\
 Toronto, ON M5S 3G4 CANADA & Boston, MA 02215 USA\\
romer@psi.toronto.edu & sclaroff@cs.bu.edu
\end{tabular}}
\date{~}
%\date{Version of \today}
\renewcommand{\baselinestretch}{1.}
\maketitle \thispagestyle{empty}
\renewcommand{\baselinestretch}{1.5}



%without the need for manual initialization, non-linear optimization.
% the camera perspective and
%orthogonal projection models.


%% \begin{abstract}
%% A probabilistic, nonlinear supervised learning model is proposed:
%% the Specialized Mappings Architecture (SMA).  The SMA employs a
%% set of several mapping functions that are estimated automatically
%% from training data. Each specialized function maps certain domains
%% of the input space (e.g., image features) onto the output space
%% (e.g., articulated body parameters). One important advantage of
%% the SMA is that it can model ambiguous, one-to-many mappings that
%% may yield multiple valid output hypotheses. Once learned, the
%% mapping functions generate a set of output hypotheses for a given
%% input via a statistical inference procedure. The SMA inference
%% procedure incorporates an inverse mapping or feedback function,
%% which enables the SMA to evaluate the likelihood of each
%% hypothesis. Possible feedback functions include computer graphics
%% rendering routines that can generate images for given hypotheses.
%% The SMA employs a variant of the Expectation-Maximization
%% algorithm for simultaneous learning of the specialized domains
%% along with the mapping functions, and approximate strategies for
%% inference. The framework is demonstrated in a computer vision
%% system that can estimate the articulated pose parameters of a
%% human body or human hands, given image silhouettes. The accuracy
%% and stability of the SMA are also tested using synthetic images of
%% human bodies and hands, where ground truth is known.

%% \end{abstract}

%which allowed us to derive inference
%methods based on the possibility of alternatively use different sets
%of conditional independence assumptions specified by the forward and
%inverse models. The inverse function

\begin{abstract}
We consider the problem of 3D and 2D articulated body pose estimation/inference from visual features from a single image. 3D pose estimation from a single image is generally regarded as ill-posed. We formulate this problem as a statistical inference problem, where the goal is to find a posterior probability distribution over poses. Statistical modelling allows us to provide a well-defined formulation and also to employ additional information in the form of labelled body poses and their respective image features. Generative models offer a principled way of accounting for hidden random variables (body pose). However, while we can define an accurate generative model for this problem, inference is intractable. On the other hand, discriminative models can be introduced where inference is tractable. Unfortunately, these models are considerably less accurate for the problem of interest, since it is not clear how to build them (\ie find a probability distribution that captures the structure of the problem). These two viewpoints are complementary and thus an ideal approach should exploit their individual advantages. We provide a natural and principled way to combine these models. In our approach, a discriminative model is learned from training data. This model is not directly used but combined with the generative model, obtaining better approximations to the intractable posterior distribution implied by the latter. We offer theoretical justification for the resulting inference algorithm and also provide two algorithms for MAP estimation that are very efficient and have clear advantages over standard body tracking methods. Performance is thoroughly evaluated using synthetic and real visual data for estimating hand and human body pose. 

\hide{We consider the problem of 3D and 2D articulated body pose
estimation/inference from visual features obtained from a single
image. 3D pose estimation (\eg MAP estimation) is generally considered
ill-posed since one cannot fully recover the body pose due that
information is lost after projection to the image plane. Here we
formulate this problem as a statistical inference problem, where the
goal is to find a posterior probability distribution over poses given
features from a single image. Statistical learning can be accomplished
by using labelled training data of body poses and their respective
image features.

Generative models offer a principled way of accounting for hidden
random variables (body pose). However, despite the fact that we can
define an accurate generative model for this problem, inference is
intractable because of the complex non-linear generative process. On
the other hand, we can introduce discriminative models where inference
is tractable. Unfortunately, these models are considerably less
accurate for the problem of interest, since it not clear how to build
appropriate discriminative models for the problem at hand (\ie a
probability distribution that captures the structure of the
problem). These two viewpoints are complementary and thus an ideal
approach should exploit their individual advantages to make inference
both accurate and feasible. 

We provide a natural and principled way to combine these models. In
our approach, a discriminative model is learned from training
data. Unlike simply using it to define a posterior distribution over
body poses, the discriminative model is combined with the generative
model to better approximate the intractable posterior distribution
implied by the generative model. We offer theoretical justification
for the resulting inference algorithm and also, provide two algorithms
for MAP estimation that are efficient and have clear advantages over
standard body tracking methods. Performance is thoroughly evaluated
using synthetic and real visual data in the tasks of estimating hand
and human body pose. We show that, even though one image seems
insufficient to recover body pose, our method clearly provides
accurate estimates using a very fast algorithm.}
%To do:
%Refer to Zhu, and others, related in the methodology.
%Check the results and address 'bad' results comments
%Check other comments
%Fix abstract
%%Check the KL stuff and later paragraphs in the critical section
%Pedro Felzenswalb
\hide{
Stan: There are two things I still need to address 

(1)add some related work, reviewers wanted us to add Zhu,Felzenswalb,Jojic-Frey...
(2)Address the 'bad' results comments
 I am running some experiments to get the absolute joint error per joint instead of RMSE (divided by the # DOF) as I did for the previous paper. I realized that RMSE indeed may be a bad way to show the results. I am also going to say what the random performance is (to avoid the comments like, 'some joints are estimated at around chance')
For a final version, we need to
(*) Spell check, delete 'regions' in figures
(**) Take out 5 pages (any suggestions)

The largest change were Sec. 5.1 -- 5.4, they were practically re-written.
}
\hide{

We provide a theoretical justification for this.... [Jaak 98]
Incorporating.... [J98]

A probabilistic, nonlinear supervised learning model is proposed: the
Specialized Mappings Architecture (SMA).  The SMA employs a set of
several forward mapping functions that are estimated automatically
from training data. Each specialized function maps certain domains of
the input space (e.g., image features) onto the output space (e.g.,
articulated body parameters). The SMA can model ambiguous, one-to-many
mappings that may yield multiple valid output hypotheses. Once
learned, the mapping functions generate a set of output hypotheses for
a given input via a statistical inference procedure. The SMA inference
procedure incorporates an inverse mapping or feedback function in
evaluating the likelihood of each of the hypothesis. Possible feedback
functions include computer graphics rendering routines that can
generate images for given hypotheses.  The SMA employs a variant of
the Expectation-Maximization algorithm for simultaneous learning of
the specialized domains along with the mapping functions, and
approximate strategies for inference. The framework is demonstrated in
a computer vision system that can estimate the articulated pose
parameters of a human's body or hands, given silhouettes from a single
image. The accuracy and stability of the SMA are also tested using
synthetic images of human bodies and hands, where ground truth is
known.}
%\hide{In the SMA
%formulation it is possible to use different sets of conditional
%independence assumptions in the forward and inverse models if
%desired.}
%In both
%tests, excellent performance is attained.
%SSChanged: Commented out last sentence...
%% It's just begging for abuse from the reviewers.
%% Let the reader be the judge please.
%% RR:OK
%% Reworded the abstract a little.  The abstract already says what's important.
%% One need not say things like "An important aspect of the approach...."  etc.
%% RR: After the clarification at the beginning of my email, you'll see that this is not correct:
% 'In the SMA formulation
%it is possible to use different sets of conditional independence
%assumptions in the forward and inverse models if desired.'
% That's why I prefer:
%'It incorporates an inverse
%mapping or feedback function, which allowed us to derive inference
%methods based on the possibility of alternatively use different sets
%of conditional independence assumptions specified by the forward and
%inverse models. The inverse function enables the SMA to evaluate the
%likelihood of each of the hypothesis.'
%% The knowledge of the inverse function allowed us to use both sets of CIA's (at the same time for interence).
%% Otherwise SMA would have been like most ML methods
\end{abstract}
\paragraph{Keywords:} Human Body Pose, approximate statistical inference, combination of generative and discriminative models, supervised learning, statistical inference, estimation of articulated structure,  Expectation Maximization algorithm, recovery of 3D from 2D.
%\paragraph{Keywords:} Human  Body Pose, estimation of articulated structure, supervised learning, combination of generative and discriminative models, statistical inference, Expectation Maximization algorithm, hand shape.
\newpage
%%%%%%%%%%%\renewcommand{\psfig}[1]{}
\section{Introduction}

\hide{ An essential task for vision systems is to infer the state of
the world given some form of visual observations. From a computational
perspective, this often involves facing an ill-posed problem; for
example, relevant information may be lost via projection of the
three-dimensional world into a two-dimensional image. As a result, it
is often the case that multiple valid interpretations of an image are
possible.  Solving an ill-posed problem requires some form of
additional information, usually provided as a model of the underlying
process. Interestingly, in their day to day life, humans are
surprisingly adept at interpreting the visual world.  }

One fundamental vision problem is that of inferring or estimating the
underlying 3D attributes of a real world object, based on its 2D
projection onto a camera. From a computational perspective, this
involves facing an ill-posed problem; relevant information is lost via
projection of the three-dimensional world into a two-dimensional
image. In this paper we will focus on non-rigid articulated objects,
in particular on human body pose and also hand configuration. Humans
can often solve these problems, even when given only a relatively
low-resolution, monocular image of the world, e.g., a photograph.  It
is believed that humans employ extensive prior knowledge about human
body structure and motion in this task \cite{Johansson73}.  Assuming
this, in this paper we will consider how a computer might learn the
underlying {\it knowledge} in the form of a probabilistic model, and
thereby infer pose from a single image.

%% R_Nov_Change: Took out this paragraph
%% Let us consider an example body pose inference task: given only a
%% person's silhouette, estimate that person's articulated body pose.  To
%% be concrete, let us define articulated pose in terms of: (a) the 2D
%% locations of the person's joints in the image, or (b) the 3D locations
%% of the person's joints in Euclidean space. Imagine drawing marks on
%% the silhouette image that approximately label the joints: left elbow,
%% right elbow, left knee, right knee, and so on. Also consider a
%% plausible 3D pose interpretation for this silhouette.  While this
%% inference task seems relatively simple for a human to perform, the
%% task is quite challenging, using either representation (a) or (b), for
%% current computer vision systems.

%RRChange ...  using either representat
%SS: OK
% An example image is shown in Fig.\ \ref{fig:exampleTask}.
\changed{For purposes of computation, the above task can be defined as
follows: given an observation vector $\mb{x}\in \Re^c$ that was
extracted from an image of a person, infer the parameterized
articulated pose as a vector $\mb{h} \in \Re^t$}. These
\changed{vector} spaces ${\Re}^c$ and ${\Re}^t$ are
continuous. \changed{In a very generic machine learning framework,
inference might be regarded as a function $\varphi:{\Re}^c
\rightarrow{\Re}^t$ that for a given input (or observation) computes
as output a single pose (\eg the most likely pose according to some
measure) or more generally a pose posterior probability distribution.
The latter would lead to a different definition of $\varphi:{\Re}^c
\rightarrow {\cl P}$, where ${\cl P}$ is a family of probability
density functions on $\Re^t$. There are many different aspects in this
problem. Some of them have been the focus of a lot of attention in
statistical learning. They are by no means solved except for certain
basic instances (\eg see \cite{Pearl88}). For many real-world problems
this is usually not the case. In articulated pose estimation, a number
of open general issues immediately appear: how to select the
appropriate type or form for this function (\eg we may have reasons to
use a discriminative instead of a generative model\footnote{The term
{\it inference} is used mainly in the context of generative models;
however, in this section we consider a broader usage by employing it
in the context of discriminative models also.}), how to take advantage
of the problem structure (\eg prior knowledge for modeling), how to
estimate (learn) this mapping from data, and how to perform inference
efficiently or approximately if exact inference is intractable (\eg
how to make use of what was learned from data). Some of these
questions are specific to the problem at hand, while the others are
fundamental and common in statistical learning.}
%%RRChange ... a single pose ...
%%SS: OK

\psfigurepath{./figs}
\begin{figure}[t]
\centerline{
\psfig{figure=Intro3w.ps,width=0.6\textwidth,clip=t}
}
\mycaptionS{\small Example ambiguity in mapping
body silhouette cues in ${\Re}^c$ to articulated body poses in
${\Re}^t$. Given silhouette $\mb{x}$, poses $\mb{a}$--$\mb{h}$
are all valid hypotheses.  In general,
entire regions in ${\Re}^t$ may contain valid poses.
\label{fig:mappingAmbiguity}}
\end{figure}

If we try to learn a mapping directly, let us say by estimating the
parameters of a parameterized function $\phi:{\Re}^c
\rightarrow{\Re}^t$ as in a discriminative approach, we encounter
several problems. The form required for $\phi$ may not be simple,
because the mapping from observations (\eg an image) to articulated
poses is generally ambiguous (one-to-many). In fact no single function
can perform this mapping. An example is illustrated in
Fig.~\ref{fig:mappingAmbiguity}: the arm locations cannot be uniquely
inferred given the silhouette $\mb{x}$; therefore, $\mb{a}$--$\mb{h}$
are all possible pose configurations (the arms can move in such a way
that the silhouette does not change). Note also that pose $\mb{c}$ is
the reflection of $\mb{a}$: the camera looks at the back rather than
at the front of the body. There might be an infinite number of valid
poses for a particular input. Moreover, regions of valid poses need
not be connected in $\Re^t$. For instance, different regions in
$\Re^t$ may correspond to ranges of valid poses, \eg some viewed from
the front and others from behind. Such ambiguities are not particular
to human body pose; for instance, analogous inference problems exist
in estimating hand pose from image features, as will be seen
later. Even though one may be tempted to just increase the complexity
of this function $\phi$ and consider this choice as necessary (due to
the apparent intricacy of the problem at hand),%
\footnote{Moreover, unnecessarily increasing the complexity of $\phi$
can have other undesirable consequences such as overfitting.} a fundamental
idea in this paper is that this choice may not be necessary, as will
be seen next.

%Complexity, and doesn't include knowledge
%%RRChange ...In fact
%%SS: OK

Let us now consider the inverse problem: given an articulated pose
vector $\mb{a}$, generate its silhouette $\mb{x}$. With a good
computer graphics model of the human body, one can easily render the
silhouette $\mb{x}$. Thus, we can easily compute what we refer to as
the inverse mapping $\zeta:{\Re}^t\rightarrow{\Re}^c$ (note that
despite the simplicity of $\zeta$, its inverse may still be complex or
not even exist). Other real world problems share the property that
their inverse problem is simpler, e.g., speech recognition (after some
parameters are given, such as pitch). In fact, this property is a key
part of our problem definition and it will play an important role in
developing the framework presented in this paper. The argument is that
the inverse function $\zeta$ provides useful information about the
structure of the problem, \changed{ but cannot be incorporated
straightforwardly in a discriminative approach or cannot be used
directly for inference. On the other hand, it might be useless in a
purely generative approach (these approaches are related to tracking):
we have a very accurate way to generate silhouettes from a given pose
configuration; however, this does not guarantee a simple algorithm for
pose inference.}

\changed{ In summary, the mapping of inputs (cues) to outputs (poses)
is ambiguous (\ie one-to-many) and potentially very complex. The
former precludes the use of discriminative supervised learning methods
that fit a single (or finite number of) functions to the data to
produce a pose given the cue (most neural networks, support vector
machines, simple least-squares, boosting, etc). The latter can easily
create computational (space and time complexity for learning) and
modeling drawbacks (such as overfitting). We also have access to the
{\it inverse} map $\zeta:{\Re}^t\rightarrow{\Re}^c$, that can be used
to define a very accurate generative model. However, this accurate
model might not be very useful in terms of finding an algorithm for
estimating the body pose given an input image (inference is
intractable). The view taken in this paper is that it can be effective
to use the individual advantages of these two complementary approaches
(discriminative and generative) to formulate an efficient solution to
the inference/learning problem.}

%% This access to the {\it inverse} map, as well as the
%% one-to-many forward ambiguity are two of the key characteristics
%% of our problem that make it different from other supervised
%% learning problems. The core algorithmic challenges are: 1.)
%% estimating the specialized domains and functions in an optimal way
%% that also takes into account the form of the specialized
%% functions, and 2.) using the knowledge of the inverse function to
%% formulate efficient inference and learning algorithms.

%%RRChange [added full paragraph]
%%SS: I removed this stuff before, because it's redundant.
%% I remove it again.
%% The first sentence is simply a restatement of the paragraph
%% before it.  And the last sentence of paragraph before that.
%% It simply won't fit.
%RR: Can we somehow state that 'these two are the main characteristics of the problem we are trying to solve which make it different from other supervised learning problems'. It will emphasize that we are doing something different. I think it is important.
%RR:
%This paragraph also states clearly what are the fundamental problems (why it is difficult).
%I think this paragraph is a great summary of the whole machine learning part of the paper. It should be kept somehow.
%% SS: OK, OK.  It's your thesis after all :)
%% But as I point out in my email this paragraph is mostly redundant.

\changed{In this paper, we describe a probabilistic, nonlinear
framework for combining generative and discriminative models for
articulated pose estimation. This approach is general, and thus can be
used in other problems with similar structure. The framework employs a
set of $M$ functions $\phi_k:{\Re}^c \rightarrow{\Re}^t$, each
associated with a mixture component in a mixture distribution. Each
function maps certain sub-domains of the input space (cues) onto the
output space (poses). } \changed{ These functions are estimated
automatically from training data via a variant of the
Expectation-Maximization algorithm. The learned conditional
distribution is then used as an approximation to an accurate
(generative model) distribution defined using the inverse function
$\zeta$, for which inference is intractable. This basic idea is shown
in a schematic way in Fig.~\ref{fig:SMAexample}. The approximation is
employed in a similar way as a proposal distribution is used to
approximate sampling from a more complex distribution.}

%RRChange  .... More importantly w
%%SS: is it really more important than other stuff already in this paragraph?
%% Also, I don't know what you mean at all. It's machine learning after all.
%% you have to pick a functional form.  If you feel it's important, can you
%% try to explain it in email?  It's not really clear what you mean here
%% at all.  I would prefer you leave it out.
%%RR:OK

%\psfigurepath{../ICCV01/iccv01/figs}
\begin{figure}[t]
\centerline{
%(a) \psfig{figure=Learning2.ps,width=0.48\textwidth,clip=t}
%~(b) \psfig{figure=Inference2.ps,width=0.46\textwidth,clip=t}
(a) \psfig{figure=map.GIF.eps,width=0.4325\textwidth,clip=t}
~~~(b) \psfig{figure=fb.GIF.eps,width=0.414\textwidth,clip=t}
}
%\begin{figure}[t]
%\vspace*{2.5in}
\mycaptionS{\small
Schematic illustration of the idea behind our method for the case of inferring
body pose: (a) Given an input vector $\mb{x}$, we generate a set of
hypotheses. (b) The inverse mapping function $\zeta$ is employed in
evaluating each hypothesis.}
\label{fig:SMAexample}
\end{figure}


\hide{
1111111111111111
\changed{
The basic concepts are illustrated in Fig.\ \ref{fig:SMAexample}. For
a given input $\mb{x}$, the discriminative model generate a set of output
hypotheses. We then exploit the generative model (defined by the
inverse mapping $\zeta$) to evaluate the probability of each
hypothesis.}}

%%SS: adjusted the size (smaller)

%%% RomerV5: took out following paragraph
\hide{An important advantage of this approach is that it can model
ambiguous, one-to-many mappings that may yield multiple valid output
hypotheses. Unlike other learning approaches that employ a set of
mapping functions (\eg \cite{Friedman91,Hinton98,Jordan94}), this
approach incorporates an inverse mapping $\zeta$ in probabilistic
inference. The framework is evaluated in a computer vision system that
can estimate the articulated pose parameters of a human body or human
hands, given real image silhouettes.  Accuracy and stability are also
tested using synthetic images of human bodies and hands, where ground
truth is known.}


%% %% For related work
%% Several other learning models use a similar concept of fitting
%% surfaces to the observed data by splitting the input space into
%% several regions and approximating simpler functions in these regions
%% (\eg \cite{Jordan94,Hinton98,Friedman91}). However, in these
%% approaches, the inverse map is not incorporated in the estimation
%% algorithm because it is not considered in the problem definition and
%% it is necessary to make the forward model more complex.

%% 1111111111111

\section{Related Work}

In computer vision, recovery of articulated body pose from images is
often formulated as a {\it tracking} problem. Usually, link-joint
models comprised of 2D or 3D geometric primitives are designed
beforehand to roughly match the specific morphology of the target in
question
\cite{Bregler98,Deutscher00,Gavrila95,OrmSidBlaHas01,Rehg95,shimada,Felzenszwalb00,Sminchisescu01}.
Mesh models have also been used as an alternative to link-joint models
\cite{heap}. At each frame, these models are fitted to the image to
minimize some cost function that favors the overlap of the model and
associated image regions (or motion). The fitting or cost function is
sometimes implicitly defined and, from our viewpoint, it
can usually be thought of as defining a generative model. Despite their
descriptive power, this family of approaches has a number of critical
drawbacks. Generally, a non-linear optimization problem must be solved
at every frame (sometimes equivalent to inference in a complex
generative model). Careful manual placement of the model on the first
frame in a video sequence is also required.  Moreover, tracking in
subsequent frames tends to be sensitive to errors in initialization
and numerical drift; as a result, these systems cannot recover from
tracking errors in the middle of a sequence.

To address these weaknesses, specialized dynamical models have
been proposed \cite{Isard98J,OrmSidBlaHas01,PavRehMac01}.  These
methods learn a prior distribution over some specific motion
class, such as walking. This prior is used to predict and
hopefully improve the pose estimates in future frames. However,
this strong prior substantially limits the generality of the
motions that can be tracked; a prior for a given class of motions
is generally useless when used for tracking objects undergoing a
different class of motion, e.g., walking vs. dancing.

Other methods for constrained tracking include
\cite{Black95}, where a subspace of allowable motions is
learned from a set of examples. These examples and the model
(usually linear) are hoped to be sufficient to span the set of
possible motions to be seen during tracking. Thus, pose inference
involves finding a linear projection of the observed data onto the
motion subspace. This subspace approach enforces a strong prior;
as mentioned previously, this limits the generalization of the
model to classes of motions not seen in the training set.
Furthermore, articulated motion is generally non-linear, and
cannot be easily explained as a linear projection.




In our approach we avoid matching image features (e.g., image
regions, points, or articulated models) from frame to frame.
Therefore, we do not refer to our approach as {\it tracking}, per
se. This is in direct contrast with the techniques mentioned
above.  A number of other approaches also depart from the
aforementioned tracking paradigm. We summarize these next.

In \cite{Howe99} a statistical approach is employed in
reconstructing the 3D motions of a human figure. The approach
employs a Gaussian probability model for short human motion
sequences. It is assumed that 2D tracking of the joint positions
in the image is given; therefore, this assumption implicitly
incurs the restrictions found in all tracking approaches.

In \cite{Perona00} dynamic programming is used to calculate the best
global matching of image points to predefined body joints, given a
learned probability density function of the position and velocity of
body features. Although not explicitly mentioned by the authors, the
probability function is defined by a triangulated acyclic graph. Thus,
inference is feasible due to the running intersection property
\cite{Jordan99,Pearl88}.  Still, in this approach, the image points
and model initialization must be provided by hand or through some
other method.

In \cite{Brand99}, the manifold of human body dynamics is modeled
via a hidden Markov model with an entropic prior. Once the states
are inferred from observations, a quadratic cost function is used
to generate a continuous path in configuration space, \ie body
pose space.

In all of the non-tracking approaches just referred to, models of {\em
motion} were estimated from data. Although the approach presented in
this paper can be used to model dynamics, we argue that when general
human motion dynamics are to be learned, the amount of training data,
model complexity, and computational resources required are
impractical. As a consequence, models with unacceptably large priors
towards specific motions are generated. Although by not modeling the
dynamics we may be ignoring information that could be used to further
constrain the inference process, there are some benefits. For
instance, a model for inferring body pose that does not consider
dynamics provides invariance with respect to speed (\ie sampling
differences) and direction in which motions are performed. This
happens simply because this model treats configurations as temporally
independent of each other. Other approaches that use a single image
include \cite{Kakadiaris00,Haritaoglu98a,Taylor00}; however, most of
these methods also require that projected joint locations be given as
input. In our approach this is not necessary.
%Lee85 erased orourke80 out

Our approach maps visual features to likely body configurations.
Following a machine learning paradigm, stochastic functions that map
visual features to pose parameters are approximated from training
data. A unique aspect of our approach is the combined use of (1) these
mapping functions (defining a discriminative model) with (2) the
inverse mapping function $\zeta$ (defining a generative model). After
multiple poses have been inferred from just the visual cues, $\zeta$
transforms these pose configurations back to the visual cue
(observation) space. In this space, we can then automatically choose
among a set of reconstruction hypotheses. This is a fully
probabilistic inference process. Our approach avoids the need for
manual initialization or tracking; it thereby avoids the consequent
disadvantages of tracking. Remarkably, relatively few computations are
required for inference. We will now formalize and explain our approach
in detail.
%RRChange .This is a ...
%% SS: OK


\renewcommand\arraystretch{0.8} %% SS: This changes separation between table rows

\begin{table}[t] {\small
\begin{tabular}{|ll|}
\hline
number of training examples & $N$\\
training set & $\cl{Z}=\{\mb{z}_1,...,\mb{z}_N\}$ \\
training example (input,output) pair & $\mb{z}_i = (\upsilon_i,\psi_i)$ \\
input (feature) training vector & $\upsilon_i \in \Re^c$ \\
output (pose) training vector & $\psi_i \in \Re^t$\\
\hline
generative and discriminative models probability distributions & $p$,$q$ (respectively)\\
observation random variable (\eg image moments) & $\mb{x} \in {\Re}^c$\\
hidden random variable of pose parameters        & $\mb{h} \in{\Re}^t $\\
feedback (rendering) function (for generative model)&$\zeta:{\Re}^t\rightarrow{\Re}^c$\\
\hline
number of samples during inference& $S$\\
a particular observation or input image feature & $\mb{x}^*$\\
output (pose) hypothesis ( a sample from $q(\mb{h}|\mb{x}^*)$)& $\mb{h}_k$\\
estimate of most likely output hypothesis & $\hat{\mb{h}}$\\
\hline
%Mapping functions (one for each mixture distribution component)& $\Phi = \{\phi_1,\dots,\phi_M\}$\\
discrete set of labels for mixture components&${\cal C}=\{1,\dots,M\}$\\
hidden random variables assigning mixture component to training samples & $\mb{y}=(y_1,\dots,y_N), y_i\in{\cal C}$ \\
prior probability that mixture component $k$ will be used & $\lambda_k = Q(y=k)$\\
mapping function parameter vector & $\theta_k$\\
discriminative model parameters (to be learned) & $\theta=(\theta_1,\dots,\theta_M,\lambda)$\\
posterior probability of $k$-th mixture component for $\mb{z}_i$ during EM& $\tilde{Q}(y_i=k)=Q(y_i=k|\psi_i,\upsilon_i,\theta)$ \\
\hline
\end{tabular}}
\mycaptionS{Some mathematical symbols used in this paper.} \label{tab:symbols}
\end{table}



\section{Probabilistic Models}
\label{sec:ProMod}

\changed{We will now formally define both the discriminative and
generative models to be employed. The discriminative model will be
estimated from data and the generative model will be defined by the
inverse function $\zeta$. They represent two views of the same problem
and will be used together in this framework to provide a solution to
inferring body pose from a single image.}

\subsection{The Discriminative Model}
\hide{
In our approach, the discriminative model is represented by a set
of mapping functions. These functions are estimated from training data,
via a supervised learning procedure. }

Let $\cl{Z}=\{\mb{z}_1,\dots,\mb{z}_N\}$ be an observed training set of
input-output pairs $\mb{z}_i = (\upsilon_i,\psi_i)$.  Each $\upsilon_i
\in \Re^c$ is an input (feature) vector, and each $\psi_i \in \Re^t$
is its corresponding output (pose) vector. A summary of mathematical
symbols used in this formulation is provided in Table
\ref{tab:symbols}.

We will approach our forward problem as one of hidden variable density
estimation. We begin by introducing the unobserved random variable
$\mb{y}=(y_1,\dots,y_N)$. In our model any $y_i$ has as its domain the
discrete set $\cl{C}=\{1,\dots,M\}$ of labels for the specialized
mapping functions, and can be thought of as the function number used
to map the $i$-th training pair, $\mb{z}_i$.  Thus $M$ is the number
of specialized mapping functions. Our model uses parameters
$\theta=(\theta_1,\dots,\theta_M,\lambda)$, where $\theta_k$
represents the parameters of the $k$-th mapping function, and
$\lambda=(\lambda_1,\dots,\lambda_M)$, where $\lambda_k$ represents
$Q(y=k)$, the prior probability that the mapping function with label
$k$ will be used to map an input-output pair.

Taking a maximum-likelihood viewpoint, we are interested in
finding the optimal parameter settings for our model; thus, we
seek to maximize the joint log-probability:
%\footnote{This is almost
%identical to taking a MAP estimate viewpoint and considering the
%parameters $\theta$ as random variables with uniform prior in some
%(bounded) interval}
\begin{equation}
\label{eq:LeaDisMod}
\theta^* = \arg\max_\theta \log q(\cl{Z}|\theta).
\end{equation}
Assuming independence of observations given $\theta$, and using
Bayes' rule we obtain:
\begin{eqnarray}
\theta^*&=&\arg\max_\theta \sum_i \log q(\mb{z}_i|\theta)\\
%&=&\arg\max_\theta \sum_i \log \sum_k q(\mb{z}_i|y_i=k,\theta)
%Q(y_i=k|\theta)\\ \label{eq:OptEq} 
&=& \arg\max_\theta \sum_i \log
\sum_k q(\psi_i|\upsilon_i,y_i=k,\theta)Q(y_i=k|\theta)
q(\upsilon_i), \label{eq:LogSum}
\end{eqnarray}
where we used the independence assumption
$q(\upsilon|\theta)=q(\upsilon)$. \changed{The term $q(\upsilon_i)$
describes how input patterns occur in the world. For solving
Eq.~\ref{eq:LogSum}, it is approximated by the empirical distribution
implied by our training data; as a consequence, patterns that occur
more often will have a larger effect in the maximization of
Eq.~\ref{eq:LogSum}.} Due to the sum of terms inside the logarithm of
Eq.~\ref{eq:LogSum}, this optimization is generally intractable.
However, a variety of practical approximate optimization methods
exist, for example, methods that are based on alternating
minimizations \cite{Csiszar84}. An Expectation Maximization (EM)
\cite{Dempster77,Neal98} method is described in Sec.\ \ref{sec:Lea}.



\subsubsection{Choice of a Likelihood Function}

Note that the above formulation is general. In particular, the form of
the probability $q(\psi_i|\upsilon_i,y_i=k,\theta)$ was not specified.
A key question in instantiating our approach is: what form should be
used for $q(\psi|\upsilon,y,\theta)$?  This is the probability that
output $\psi$ was generated by the mapping function $y$, given the
input $\upsilon$ and model parameters $\theta$. In this work we
analyze the following possible cases:
\begin{enumerate}
\item A Gaussian joint distribution of input-output vectors:
%\begin{equation}
$q(\upsilon,\psi|y,\theta)=\cl{N}((\upsilon,\psi);\mu_{y},\Sigma_{y})$.
%\end{equation}
\item A Gaussian distribution, whose mean is the output of the
$y$-th mapping function:
%\begin{equation}
$ q(\psi|\upsilon,y,\theta)
 =\cl{N}(\psi;\phi_{y}(\upsilon,\theta),\Sigma_{y})$.
%\end{equation}
\end{enumerate}
\out{
One way to interpret (2) is that the error in estimating $\psi$,
given we know what mapping function to use, is Gaussian
distributed. %The distribution's mean is the output of the
%specialized function, and its covariance is dependent on the
%specialized function used.
These are the two forms tested in our experiments; however, this formulation is general, and can accept other forms for the
likelihood function.}

\subsection{The Generative Model}

\changed {Our approach involves the use of a generative model of images
(or image features). In the problem of human body pose estimation from
a single image this generative model can be defined in a simple
way. We will assume that an image or image features are generated by
sampling a pose from a prior distribution $p(\mb{h})$ and an image is
then generated using the rendering function $\zeta$ such that:
\beqa
\label{eq:zetaNormalDist}
%\label{eq:gmtd}
p(\mb{x}|\mb{h})={\cl N}(\mb{x};\zeta(\mb{h}),\Sigma_\zeta).  \eeqa It
is important to notice that despite the fact that the generative model
can be defined in a simple manner, the function $\zeta$ is highly
complex (non-linear); this makes probabilistic inference
intractable. In establishing a connection to previous methods, this
inference problem is usually referred to as {\it tracking}. Fitting an
articulated model (\eg composed of solid primitives) is equivalent to
a form of probabilistic inference with several important, well-known
drawbacks: this problem requires non-linear optimization of a very
complex function and a good initial guess is difficult to determine
automatically (this is usually provided by manual placement of the articulated model). This form of fitting also has other drawbacks already
explained.}

\section{Learning}
\label{sec:Lea}

\changed{An approximation method will be used in learning the
discriminative model parameters}.  We will employ an Expectation
Maximization (EM) approach. EM provides a general framework for
solving the maximum likelihood parameter estimation problem in
statistical models with hidden variables, like Eq.\
\ref{eq:LogSum}. Since the EM algorithm is well known
\cite{Dempster77,Amari95,Neal98}, we will only provide derivations
specific to our formulation.

Note that the unobserved random variables $y_i$ are independent given
$\mb{z}_i$. Thus, the E-step reduces to computing the posterior
probabilities for each $y_i$ given the model parameters and observed
data. We will denote this posterior
$Q(y_i=k|\psi_i,\upsilon_i,\theta)$ using the shortcut notation
$\tilde{Q}^{(t)}(y_i=k)$. We then have:
\begin{equation}
\tilde{Q}^{(t)}(y_i=k)=\lambda_{k}q
(\psi_i|\upsilon_i,y_i=k,\theta^{(t-1)})/\sum_{j \in \cl{C}} \lambda_j
q(\psi_i|\upsilon_i,y_i=j,\theta^{(t-1)}).
\end{equation}
Stated differently, this step estimates the responsibility of each
mapping function, $\phi_k$ for each data point,
$\mb{z}_i$. \changed{$\tilde{Q}^{(t)}(y_i=k)$ represents the so-called
responsibility of function $k$ for data pair $i$. Also recall that
$\lambda_k=Q(y_i=k)$ is the prior probability that function $k$ be
used.}

The M-step consists of finding $\theta^{(t)}=\arg\max_\theta
E_{\tilde{Q}^{(t)}}[\log q(\cl{Z},\mb{y}|\theta)]$. In both of our
cases we can show that this is equivalent to finding:
\begin{equation}
\label{eq:MDef} \theta^{(t)}=\arg\max_{\theta} \sum_i \sum_{k \in
\cl{C}} \tilde{Q}^{(t)}(y_i=k) [\log q(\mb{z}_i|y_i=k,\theta)+ \log
Q(y_i=k|\theta)].
\end{equation}

It is important to mention that this is valid if
$q(\mb{z}_i|\mb{y},\theta)$ depends on $y_i$ and not on $y_j$, for any
$j\neq i$.  Note that for the distributions discussed above, this
is true. We now present solutions for the cases described above.

\subsection{Case (1)}

In this case we have:
\begin{equation} q(\upsilon,\psi|y,\theta)=
\cl{N}(\upsilon,\psi;\mu_{y},\Sigma_{y})= \cl{N}(\left[
\begin{array}{c}
\upsilon \\
\psi \\
\end{array}
\right];\left[
\begin{array}{c}
\mu_\upsilon \\
\mu_\psi \\
\end{array}
\right],
\left[
\begin{array}{cc}
\Sigma_{\upsilon\upsilon} & \Sigma_{\upsilon\psi}\\
\Sigma_{\upsilon\psi}^\top & \Sigma_{\psi\psi} \\
\end{array}
\right] )_{y}.
\end{equation}
We can show that the parameter learning problem
is reduced to a mixture of Gaussian estimation, for which it is
straightforward to estimate $\theta$ using EM. Moreover, the
Bayesian estimate of $\psi$ given an observed $\upsilon$ is also
Gaussian: $
%\begin{equation}
q(\psi|\upsilon,y,\theta)=\cl{N}(\psi;\mu_\psi+\Sigma_{\upsilon\psi}^\top\Sigma_{\upsilon\upsilon}^{-1}(\upsilon-\mu_\upsilon),\Sigma_{\psi\psi}-\Sigma_{\upsilon\psi}^\top\Sigma_{\upsilon\upsilon}^{-1}\Sigma_{\upsilon\psi})_{y}.$
%\end{equation}
Therefore in case (1), each function $\phi_{k}$ is just the mean of
the conditional distribution
\begin{equation}
\phi_k(\upsilon,\theta)=(\mu_\psi+\Sigma_{\upsilon\psi}^\top\Sigma_{\upsilon\upsilon}^{-1}(\upsilon-\mu_\upsilon))_{y=k}.
\label{eq:phiGaussian}
\end{equation}
The confidence of the estimate is given by the covariance
%\begin{equation}
$\Sigma_k =
(\Sigma_{\psi\psi}-\Sigma_{\upsilon\psi}^\top\Sigma_{\upsilon\upsilon}^{-1}\Sigma_{\upsilon\psi})_{y=k}.$
%\label{eq:sigmaGaussian}
%\end{equation}
However, this expression
does not depend on the input, a sometimes undesirable consequence
of the given model. Note also that, by Eq.~\ref{eq:phiGaussian}, each
function $\phi_k$ is linear (affine) in the input vector from ${\Re}^c$.


\subsection{Case (2)}

In this case we have:
\begin{eqnarray}
\label{eq:lambda_der}
\frac{\partial E}{\partial \lambda_k} &=&  \sum_i \tilde{Q}^{(t)}(y_i=k)
\frac{\partial}{\partial \lambda_k} \log Q(y_i=k|\theta)\\
\label{eq:sigma_der} \frac{\partial E}{\partial \Sigma_k} &=&
\sum_i \tilde{Q}^{(t)}(y_i=k)
\frac{\partial}{\partial \Sigma_k} \log q(\psi_i|y_i=k,\upsilon_i,\theta_k)\\
\frac{\partial E}{\partial \theta_k}&=&\sum_i
\tilde{Q}^{(t)}(y_i=k) [(\frac{\partial}{\partial
\theta_k}\phi_{k}(\upsilon_i,\theta_k))^\top\Sigma_{k}^{-1}
(\psi_i-\phi_{k}(\upsilon_i,\theta_k))], \label{eq:theta_up0}
\end{eqnarray}
where $E$ is the cost function that we would like to maximize in Eq.~\ref{eq:MDef}.
%RRChange Eq above, [there was an error]
%%SS: OK

This gives the following update rules for $\lambda_k$ and
$\Sigma_k$, where Lagrange multipliers were used to incorporate
the constraint that the sum of the $\lambda_k$'s is 1:
\begin{eqnarray}
\label{eq:lambda_up}
\lambda_k^{(t)}&=&\frac{1}{N}\sum_i\tilde{Q}^{(t)}(y_i=k)\\
 \label{eq:Sigma_up} \Sigma_k^{(t)}&=&\sum_i
\tilde{Q}^{(t)}(y_i=k) (\psi_i-\phi_{k}(\upsilon_i,\theta_k))
(\psi_i-\phi_{k}(\upsilon_i,\theta_k))^\top/\sum_i\tilde{Q}^{(t)}(y_i=k)
\end{eqnarray}

To keep the formulation general, we have not yet defined the form
of the mapping functions $\phi_k$. Whether or not we can find
a closed form solution for the update of $\theta_k$ depends on the
form of $\phi_k$. For example if $\phi_k$ is a non-linear
function, we may have to use iterative optimization to find
$\theta_{k}^{(t)}$. If $\phi_k$ yields a quadratic form, then a
closed form update exists.

%\comment{Is there some place in this paper that provides the
%details of the update for the $\phi_k$ you used?  For instance NN
%or other function? As it is, the paper is incomplete; it does not
%give all of the details that someone needs to duplicate your
%system.}

\changed{Regarding our generative model, there is very little
learning involved. If $\zeta$ is very accurate, then we could also
tell very accurately the image that will be generated given a body
pose $\mb{h}$. In practice $\zeta$ can be defined only approximately.
We account for this by properly setting $\Sigma_\zeta$.}

% ATT the prior, ATT \Simga_z
% However, the prior probability over poses
%$p(\mb{h})$ is unknown, but interestingly, as we will see in the
%following section, we do not need to specify it in our generative
%model.}

\out{
\subsection{Stochastic Learning}

The aforementioned optimization equations for the discriminative model
can be used to find a local minimum given the initial parameter
values. In order to improve this process, and avoid some of the local
minima that inevitably arise, we use an annealing schedule on the
$\tilde{Q}^{(t)}$ probabilities during the M-step. In this way, we
redefine:
\begin{equation}
\tilde{Q}^{(t)}(y_i=j) \leftarrow
\frac{e^{\log(\tilde{Q}^{(t)}(y_i=j))/T(t)}}{\sum_{k \in \cl{C}}
e^{\log(\tilde{Q}^{(t)}(y_i=k))/T(t)} }.
\end{equation}

In our experiments, the temperature parameter $T$ decays
exponentially. This step not only helps in avoiding local minima,
but it also creates two desirable effects. It forces
$\tilde{Q}^{(t)}(y_i=j)$ to be binary (either $1$ or $0$) at low
temperatures; as a consequence each point will tend to be mapped
by only one function at the end of optimization.
Moreover, it makes $\tilde{Q}^{(t)}(y_i=k)$ ($k=1,2,...,M$) be
fairly uniform at high temperatures, making the optimization less
dependent on initialization.}

%Note that in some cases, there is no closed-form solution for the
%M-step. In practice we have decided to perform two or three
%iterations per M-step. A source of randomness added to the process
%so far described consists of choosing data points randomly and
%uniformly distributed when performing the M-step. These two
%variants of the M-step have been justified in the sense of a
%partial M-step \cite{Neal98}.

%\comment{While the above paragraph makes some sense, it is really
%unclear how you are actually performing the M-step.  It is best
%understood in an example (say for your MLP that would be used in
%the experiments anyway). Perhaps a new subsection is needed here
%to give a summary of the learning algorithm, and the MLP example.}

\section{Inference}

\label{sec:InfSMA} 
%%\changed whole section!!
\changed{ In this section, we refer to probabilistic inference as
finding a full probability distribution for $\mb{h}$ given that
$\mb{x}=\mb{x}^*$ once an observation $\mb{x}^*$ has been made (\eg
some image features were observed).}

\subsection{Inference using the Discriminative Model Alone}
\changed{ \out{Learning the discriminative model yields a set
of  functions that map elements from the input space to the
output space. Each of the specialized functions maps different parts
of the input space with different levels of accuracy. This mapping
behavior is described probabilistically by $q$ in
Eq.~\ref{eq:LeaDisMod}.} A valid approach to inference is to use the
discriminative model alone. In order to understand how this differs
from our proposed solution (where we combine both, generative and
discriminative models), we will now show what inference involves in
terms of maximum a posteriori (MAP) estimation using the
discriminative model.}

\changed{In a general sense inference involves finding a
full probability distribution for $\mb{h}$ given $\mb{x}^*$; the
discriminative model directly provides this expression. In MAP
estimation we just have to maximize it (\ie we want to find the most
likely output hypothesis $\mb{h} \in \Re^t$ for a given observation
$\mb{x}^* \in \Re^c$):
%\begin{equation}
$\hat{\mb{h}}=\arg\max_\mb{h} q(\mb{h}|\mb{x}^*)=\arg\max_\mb{h} \sum_y
q(\mb{h}|\mb{x}^*,y) Q(y)$,
%\end{equation}
where $q(\mb{h}|\mb{x}^*)$ is a shorthand for
$q(\mb{h}|\mb{x}=\mb{x}^*)$. Any further treatment depends on the
properties of the probability distributions involved.}

In both Cases (1) and (2) considered in previous sections, we can
write $q(\mb{h}|\mb{x},y)=
{\cl{N}}(\mb{h};\phi_y(\mb{x}),\Sigma_y)$.
%In Case (2), by
%definition this is exactly the form of the conditional
%distribution. In Case (1), the form of $\phi_y$ and the covariance
%are described in Eqs. \ref{eq:phiGaussian} and
%\ref{eq:sigmaGaussian}.
\changed{Thus, in either case we have that $q(\mb{h}|\mb{x}^*)$ is a mixture of
Gaussians and if we want to find the MAP estimate we need to solve: }
%\begin{equation}
$\label{eq:StdInf}
\hat{\mb{h}}=\arg\max_\mb{h} \sum_y
{\cl{N}}(\mb{h};\phi_y(\mb{x}^*),\Sigma_y) Q(y).$ 
%\label{eq:hMix}
%\end{equation}

This result was obtained from performing (MAP) inference using our
learned discriminative model alone, where we learned
$q(\mb{h}|\mb{x})$ as an approximation to the true distribution
defined by $p(\mb{h}|\mb{x})$, using the training data. Even though we
could simply adopt this as a solution, it should not be surprising
that we could improve upon this by using our knowledge of $p$.

%However, we have yet to make use of the inverse (rendering) function
%$\zeta:{\Re}^t\rightarrow{\Re}^c$ in our framework.
%RRChange Eq.~\ref{eq:StdInf}...
%% SS: OK

\subsection{Inference Using the Generative Model Alone}
\label{sec:Inf2}

%(recall that the generative model is built from knowledge of the function $\zeta$, the image generating function, thus we use the term 'true' posterior)

Using the generative model, inference involves finding the posterior $p(\mb{h}|\mb{x}=\mb{x}^*)$ ($p(\mb{h}|\mb{x}^*)$ as a shorthand):
\beqa
p(\mb{h}|\mb{x}^*)&=&\frac{p(\mb{x}^*|\mb{h})p(\mb{h})}{p(\mb{x}^*)}=\frac{1}{Z_p}{\cl N}(\mb{x}^*;\zeta(\mb{h}),\Sigma_\zeta)p(\mb{h})\\
\label{eq:Zp}
Z_p&=&\int {\cl N}(\mb{x}^*;\zeta(\mb{h}),\Sigma_\zeta)p(\mb{h})d\mb{h}.
\eeqa
There are, however, at least two difficult obstacles to achieving this: (1) the integral in Eq.~\ref{eq:Zp} cannot be solved easily and, moreover, (2) we do not have an expression for $p(\mb{h})$.

In MAP estimation, the goal is to find $\hat{\mb{h}}$ such that:
%\begin{equation}
$\hat{\mb{h}}=\arg\max_\mb{h} p(\mb{h}|\mb{x})=\arg\max_\mb{h} {\cl N}(\mb{x};\zeta(\mb{h}),\Sigma_\zeta)p(\mb{h}).$
%\end{equation}
Needless to say, in the case of body pose estimation this is, in
general, a highly complex non-linear optimization problem (tracking), as we have seen before.

A key idea in this paper is that both obstacles would become much
simpler if, somehow, we could accurately obtain samples from
$p(\mb{h}|\mb{x})$. Those samples could be used to (1) approximate
this posterior and (2) find the most likely sample for MAP
estimation. However sampling accurately from a given distribution, in
particular $p(\mb{h}|\mb{x})$, is in general an open problem
\cite{Mackay98}. 

\subsection{Inference and Importance Sampling. Combining Generative and Discriminative Models}
\label{sec:GenInf}
In general, sampling can be used to estimate expectations of a given function
$I(w)$ with respect to some probability density $\pi(w)$ that we can
evaluate at any point, but that we cannot sample from. Let us say we
need to calculate the integral 
%\beqa 
$\label{eq:int}
{\cl I}=\int \pi(w)I(w)dw$, 
%\eeqa 
by approximating ${\cl I}$ employing  $R$  samples: $\hat{\cl I}=\frac{1}{R}\sum_{r=1}^R I(w^{(r)})$.  

The question is how to appropriately generate the samples to obtain the
best estimate for ${\cl I}$. Sampling can be used in more general
tasks.  Usually it is only necessary to evaluate $\pi^\star(w)$ which
is equal to $\pi(w)$ to within a multiplicative factor,
$\pi(w)=\pi^\star(w)/Z$. 

%Thus, in Eq.~\ref{eq:int} $\pi$ could instead
%be a positive function.

Since we cannot usually generate samples accurately, we need to
account for our sampling inaccuracies. Importance sampling is a method
that accounts for this as follows: first we come up with a proposal
distribution $\pi'(w)$, which we can also evaluate (within a
multiplicative factor) but from which it is possible to
sample; then we sample from $\pi'(w)$, but also correct for the bias
introduced when sampling:
\beqa
\hat{\cl I}=\frac{1}{R}\sum_{r=1}^R \frac{\pi^\star(w^{(r)})}{\pi'(w^{(r)})}I(w^{(r)}).
\eeqa
\out{
The variance of this estimator is given by:
\beqa
\sigma^2(\hat{\cl I})=\frac{1}{R(R-1)}\sum_{r=1}^R (\frac{\pi^\star(w^{(r)})}{\pi'(w^{(r)})}I(w^{(r)})-\hat{\cl I})^2.
\eeqa}
It can be shown that when $R\rightarrow\infty$, $\sqrt{R}(\hat{\cl I}-{\cl I})\sim{\cl N}(0,\sigma^{2}_{\pi'})$, with: $\sigma^{2}_{\pi'}=\int (\frac{\pi^\star(w)}{\pi'(w)}I(w)-{\cl I})^2 \pi'(w)dw$. Thus, the expected variance of our estimate is proportional to $\sigma^{2}_{\pi'}$ and inversely proportional to $R$ \cite{Mackay98}.

We would like to know what the optimal proposal distribution is in order
to maximize the accuracy of our estimator (\ie minimize
$\sigma^{2}_{\pi'}$). The optimal proposal distribution to approximate
${\cl I}$ is given by \cite{Rubinstein81,Cheng00}: \beqa
\label{eq:Just}
\pi'(w)=\pi^\star(w)/\int \pi^\star(w)dw,
\eeqa
the normalized function $\pi^\star(w)$. Since, on purpose, we introduced $\pi^\star(w)$ first as an unnormalized distribution originating from $\pi(w)$, we know that upon normalization we will get back the original distribution $\pi(w)$. A similar justification was used in \cite{Cheng00}, but their way of building a proposal distribution is different from ours.

\changed{Thus, in finding a posterior distribution $p(\mb{h}|\mb{x})$ for body
poses given observed image features the partition function in}
Eq.~\ref{eq:Zp} can be computed as follows: \beqa
\label{eq:hatz}
Z_p=\int p(\mb{x}^*,\mb{h})d\mb{h} \approx \frac{1}{S} \sum_{s=1}^S p(\mb{x}^*,\mb{h}\us)/p(\mb{h}\us|\mb{x}^{*}),
\eeqa
using importance sampling, with samples taken from $p(\mb{h}|\mb{x}^*)$.
%\beqa
%\label{eq:hatz}
%\hat{Z}_p=\frac{1}{R} \sum_{s=1}^S p(\mb{x}^*,\mb{h}\us)/p'(\mb{h}\us),
%\eeqa 
We have shown that the best proposal distribution is
$p(\mb{h}|\mb{x}^*)$; unfortunately this is the distribution we are after, and of course we cannot sample from it.

The main reason behind using generative and discriminative models
together is to tackle this particular problem of sampling from a good
distribution. We will use the learned distribution $q(\mb{h}|\mb{x})$
(discriminative model) to approximate $p(\mb{h}|\mb{x})$, but just at
$\mb{x}=\mb{x}^*$. This approximation is in terms of maximum likelihood
estimation and can also be seen as minimizing the KL divergence
between the empirical distribution $p_e$, given by the training data,
and the model distributions $q$, parameterized by $\mb{\theta}$:
\beqa
{\rm KL}(p_e(\mb{x},\mb{h})||q(\mb{x},\mb{h}))=\int p_e(\mb{x},\mb{h}) \log [p_e(\mb{x},\mb{h})/q(\mb{x},\mb{h})] d\mb{h} d\mb{x},
\eeqa 
which can be proven to be equivalent to:
\beqa
\arg\min_{\rm{\theta}} E_{p_e(\mb{x})}[{\rm KL}(p_e(\mb{h}|\mb{x})||q(\mb{h}|\mb{x}))],
\eeqa
where $\theta$ parameterizes $q$. In practice, the expectation becomes a sum over the training data pairs, and we obtain Eq.~\ref{eq:LeaDisMod}. Thus, the optimal distribution in this sense is the one that results from solving Eq.~\ref{eq:LeaDisMod}, to obtain $q(\mb{h}|\mb{x})$. Of course, we assume that the data is composed of representative examples from $p$, so that the empirical distribution $p_e$ is at all useful.
%ATT
Eq.~\ref{eq:Just} justifies this choice since it tells us that in order
to find a good approximation for the posterior $p(\mb{h}|\mb{x})$ we
should find a proposal distribution that is similar to it, as
intuitively expected. We may then ask if we could use this proposal
distribution alone. The reason why this is not a good idea is that,
since we cannot usually find a proposal distribution that matches the
true posterior perfectly, using this proposal distribution alone is
expected to perform worse than when combined with our accurate
generative model. This is mainly because in regions where the proposal
distribution $q$ is bad at approximating $p$, we can always evaluate
$p$ and note the discrepancy.
%ATT^

The distribution $q(\mb{h}|\mb{x})$ is an approximation to
$p(\mb{h}|\mb{x})$ in the space of all distributions with the
structure specified by the discriminative model (a mixture model in our
case). For Gaussian mixture models, it is known that this approximation
can be made as accurate as we wish in the limit of infinite data and
mixture components. Interestingly, we do not need to know explicitly
what $p(\mb{h})$ is in our generative model. Of course, this is
implicitly specified by the training data.  Thus, even if we use a not
so good assumption for $p(\mb{h})$, still we know what we need to do
in order to achieve a good estimate of the posterior. This is helpful
since we do not really know accurately what $p(\mb{h})$ is (given that
we may not have enough data to estimate it accurately). In the
following we simply use a uniform distribution (in a reasonable
finite domain).

To summarize, in order to compute the posterior distribution of body
poses $\mb{h}$, given an observation of image or image features
$\mb{x}^*$, we calculate an estimate for $p(\mb{h}|\mb{x}^*)$ as follows:
\beqa
\hat{p}(\mb{h}|\mb{x}^*)=\frac{1}{\hat{Z}_p}{\cl N}(\mb{x}^*;\zeta(\mb{h}),\Sigma_\zeta)p(\mb{h}),
\eeqa
with $\hat{Z}_p$ given by Eq.~\ref{eq:hatz}, but replacing $p(\mb{h}|\mb{x}^*)$ with $q(\mb{h}|\mb{x}^*)$ and using samples from $q(\mb{h}|\mb{x}^*)$.
%ATT H
%% Assuming
%% that the data is composed by representative examples from $p$ we hope
%% to learn a good
%% Thus, this justifies why learning a discriminative distribution
%% $q(\mb{h}|\mb{x})$ is a sensible approach. When learning $q$ from
%% training data, we are trying to approximate $p$ assuming that the data
%% is composed by representative examples from the true distribution.
%% It is known that a mixture of Gaussians can approximate
%% any distribution if enough mixture components are used. Thus, in the
%% limit of infinite data and a large enough mixture our discriminative
%% distribution $q$ could in theory approximate the generative
%% distribution $p$.
%% p(x*)=\int p(x*,h) dh
%% approx with (using IS)
%% p(x*)=1/R \sum p(x*,h)/p'(h)
%% Rubinstein says that the best dist that we can use to sample is p(x*,h) normalized, so that it is a valid pdf, which is p(x*,h)/int p(x*,h) dh=  p(x*,h)/int p(x*) = p(h|x*)!!!!
\subsection{Non-deterministic MAP Estimation: Multiple Samples (MS)}
\label{sec:MS}

We are usually interested in providing likely samples from the
posterior distribution, in particular we might be interested in the most likely $\mb{h}$. This is the idea behind MAP estimation, where we are interested in
finding 
%\beqa
%\label{eq:ForMAP}
$\hat{\mb{h}}=\arg\max_\mb{h} p(\mb{h}|\mb{x}^*)=\arg\max_\mb{h} p(\mb{x}^*|\mb{h})p(\mb{h})$.
%\eeqa

We know that the discriminative model distribution $q(\mb{h}|\mb{x})$
tries to approximate $p(\mb{h}|\mb{x})$, and therefore it is good at
minimizing the variance of the estimator. Due to this, we will use the
discriminative model distribution to provide samples for MAP
estimation. In MAP estimation, we sample ${\cl H}_{Spl}= \{\mb{h}_s
\}_{s=1...S}$ using the proposal distribution
$q(\mb{h}|\mb{x}^*)$. Given the samples, the problem then becomes a
discrete optimization problem that can be solved easily:
\begin{equation}
\label{eq:ASolM}
{\hat s}=\arg\max_s p(\mb{x}^*|\mb{h}_s)=\arg\min_s
(\mb{x}^*-\zeta(\mb{h}_s))^\top \Sigma_\zeta^{-1}
(\mb{x}^*-\zeta(\mb{h}_s)),
\end{equation}
by using the Gaussian form of $p(\mb{x}|\mb{h})$ as given in Eq.\
\ref{eq:zetaNormalDist}.  We remark that after using the samples ${\cl
H}_{Spl}$ as a starting point, other more sophisticated methods could
be employed. For example we could use Markov chain Monte Carlo (MCMC)
sampling \cite{Mackay98} to search for regions of higher
probability. Also, instead of stochastic methods, we could employ
standard gradient descent methods to locally search for more likely
poses $\mb{h}$ (as in tracking). These methods may be helpful for some
distributions but in general have several drawbacks: (1) They are
usually very slow in high dimensions and (2) given finite time, they
are not very useful/accurate if the posterior probability is very
complex. Some methods have been proposed to alleviate these problems,
but this goes beyond our current contribution. Keeping this extension
in mind, in this paper we simply use the original samples ${\cl
H}_{Spl}$ to search for a MAP estimate. These estimates proved to be
sufficiently accurate during our experiments.

%% Let us assume that we can approximate $\sum_y p(\mb{h}|\mb{x},y)
%% P(y)$ by a set of samples generated according to
%% $p(\mb{h}|\mb{x},y) P(y)$ and a kernel function
%% $K(\mb{h},\mb{h}_s)$, such that $K(\mb{h},\mb{h}_s) \geq 0$ and
%% $\int K(\mb{h},\mb{h}_s) d\mb{h}=1$ for any given $\mb{h}_s$.
%% Given a set of samples ${\cl H}_{Spl}= \{\mb{h}_s \}_{s=1...S}$,
%% we can construct the approximation $\sum_y p(\mb{h}|\mb{x},y) P(y)
%% \approx \frac{1}{S} \sum_{s=1}^S K(\mb{h},\mb{h}_s)$. We now
%% consider two simple forms for the kernel function $K$.

%% If we use a Dirac delta function kernel centered at each sample
%% $K(\mb{h},\mb{h}_s)=\delta(\mb{h}-\mb{h}_s)$, then we have: $
%% \mb{h}^* \approx \arg\max_\mb{h} p(\mb{x}|\mb{h})
%% \frac{1}{S}\sum_{s=1}^S \delta(\mb{h}-\mb{h}_s)$. This can be
%% reduced to an equivalent discrete optimization problem where the
%% goal is to find the most likely sample $s^*$:
%% \begin{equation}
%% \label{eq:ASolM}
%% s^*=\arg\max_s p(\mb{x}|\mb{h}_s)=\arg\min_s
%% (\mb{x}-\zeta(\mb{h}_s))^\top \Sigma_\zeta
%% (\mb{x}-\zeta(\mb{h}_s)),
%% \end{equation}
%% by using the Gaussian form of $p(\mb{x}|\mb{h})$ as given in Eq.\
%% \ref{eq:zetaNormalDist}.

%% If instead we use Gaussian kernels centered at each sample
%% $K(\mb{h},\mb{h}_s)={\cl N}(\mb{h};\mb{h}_s,\Sigma_{Spl})$, then
%% we have: $\mb{h}^* \approx \arg\max_\mb{h}  p(\mb{x}|\mb{h})
%% \frac{1}{S}\sum_{s=1}^S {\cl N}(\mb{h};\mb{h}_s,\Sigma_{Spl})$.
%% This approximation is harder to use in practice. Unlike the Dirac
%% delta kernel approximation, the Gaussian approximation cannot be
%% reduced to an equivalent discrete optimization since there is no
%% guarantee that the optimal $\mb{h}$ for this form is among the
%% samples in general.
\subsection{Deterministic MAP Estimation: Mean Output (MO)}
\label{sec:MO}

In certain applications, it might be advantageous to have a very
fast method for computing MAP estimates. Two examples are: when
working with multiple articulated bodies and in dynamic settings where
it is necessary to provide estimates at a high rate. Even though the
time complexity of MS scales linearly with the number of samples, this
might not be fast enough. Motivated by speed constraints, here we
propose a very fast MAP estimation algorithm that still performs well
in experiments. Unlike MS, this algorithm is deterministic.

The structure of the problem, as well as the form of the
discriminative distribution components (\ie conditioned on the mixture
label) $q(\mb{h}|\mb{x},y)$ employed (Gaussian), make it possible to
construct this deterministic approximation. The basic intuition is
straightforward. For a given $\mb{x}=\mb{x}^*$, we {\em ask} each
mapping function $\phi_k$ to give its most likely estimate for
$\mb{h}$.  We then evaluate the probability of each function's
estimate via the generative model distribution
$p(\mb{x}^*|\mb{h})$. This approximation is good in practice, as will
be demonstrated in the experiments.

To justify this deterministic approximation, we note that due to
\changed{its} concavity properties, the probability of the mean is
maximal in a Gaussian distribution; \ie it is the most-likely value.
Formally, in both Case (1) and Case (2) described earlier,
$q(E[\mb{h}|\mb{x}^*,y,\theta])\geq q(\mb{h}'|\mb{x}^*,y,\theta)$, for
any $\mb{h}'$. Consider again the set of samples ${\cl H}_{Spl}=
\{\mb{h}_s \}_{s=1...S}$ generated in the MS approximation. We can
build a set of samples ${\cl H}_{\phi}=\{\mb{h}_{k}^{\phi}
\}_{k=1...M}$ that has the property
%\begin{equation}
$\forall y, \max_k q(\mb{h}_{k}^{\phi}|\mb{x}^*,y) \geq \max_s
q(\mb{h}_{s}|\mb{x}^*,y)$,
%\end{equation}
simply by setting $\mb{h}_{k}^{\phi}=\phi_k(\mb{x}^*,\theta)$.

This insight leads to a deterministic approximation for inference, the
{\it Mean Output} solution (MO). This approximate solution relies on
the observation that by considering the means $\phi_k(\mb{x}^*)$, we
would be considering the most likely output of each mapping
function (\ie each mixture component in the discriminative model),
given the input. Obviously, we expect that the discriminative model provides
a good approximation of our generative model posterior distribution as
discussed above. Also, the smaller the overlap among the distributions
associated with each function, the better the accuracy of
this approximation.

In MO approximate inference, the expression to be minimized is the
same as that used in Eq.\ \ref{eq:ASolM}, except for the use of
the $M$ means instead of the $S$ samples:
\begin{equation}
\hat{k}=\arg\max_{k \in {\cl C}}  p(\mb{x}^*|\mb{h}_{k}^{\phi})
=\arg\min_{k \in {\cl C}} (\mb{x}^*-\zeta(\mb{h}_{k}^{\phi}))^\top
\Sigma_\zeta^{-1} (\mb{x}^*-\zeta(\mb{h}_{k}^{\phi})). \label{eq:ASolU}
\end{equation}
This generally requires substantially less computation than would be
required in the MS approach.

\section{Example Application: Articulated Pose from Visual Features}
\label{sec:Apps}

The formulation presented in this paper is rather general, and could
be applied in a number of supervised learning problems for which the
output-to-input (feedback) map is relatively easy to compute;
\changed{ thus allowing us to specify an accurate generative
model}. To demonstrate and test our framework, we have developed a
system that uses our approach to infer articulated pose from
low-level visual features. In particular, we focused on pose
estimation of the human hand and body from an image silhouette. In
this class of computer vision applications, ground truth datasets for
training can be obtained via motion capture gloves or body suits, and
computer graphics rendering can be used to generate the input-output
pairs used in supervised learning.  We will now give details of this
demonstration system.

\subsection{3D Hand Pose Estimation}
\label{sec:AppsHand}

The goal is to recover detailed 3D hand pose from silhouette features
computed from a single color image. Hand pose is defined in terms of
the hand joint angles. In general, we are also interested in global
orientation of the hand. We explore two applications: estimation of
the internal joint angles only, and later, estimation of both internal
joint angles and global orientation of the hand.

\subsubsection{Hand Model}

We utilize the hand model provided in the VirtualHand programming
library \cite{virtual_hand}. The model parameters are 22 joint
angles. For the index, middle, ring and pinky finger, there is an
angle for each of the distal, proximal and metacarpophalangeal
joints. For the thumb, there is an inner joint angle, an outer
joint angle and two angles for the trapeziometacarpal joint. There
are also abduction angles between the following pairs of
successive fingers: index/middle, middle/ring and ring/pinky.
Finally, there is an angle for the palm arch, an angle measuring
wrist flexion and an angle measuring the wrist bending towards the
pinky finger. However, because the latter two wrist angles also
encode global orientation, we decided not to model them in our
application. Hence, ignoring these two angles, our model has 20
DOF for the internal hand configuration.

All of these 20 angles are relative to two global orientation
angles. These two angles will encode the camera viewpoint (or
alternatively hand 3D rotation). Imagine a sphere surrounding the
hand model, \ie a fixed hand center point is at the center of the
sphere. For ease of reference, we will employ the widely used
latitude and longitude notions. The first angle $\beta_1$
represents the latitude from which we are looking at the hand, the
second angle $\beta_2$ represents the longitude. We have defined
$\beta_1 \in [0,\pi]$, with zero and $\pi$ being the {\it poles}
of the sphere and $\beta_2 \in [0,2\pi)$. Thus, in summary our
full hand model has 22 DOF.


\psfigurepath{./figs/}
\begin{figure}[t]
\centerline{\small
\psfig{figure=AllViewsHandBin.ps,width=0.5\textwidth} }
\mycaptionS{\small Example of the 86 silhouettes obtained via
computer graphics rendering for a given 3D hand pose.  Views are
distributed approximately uniformly over the view
sphere.}\label{fig:HAllViews}
\end{figure}

\subsubsection{3D Hand Motion Datasets}
\label{sec:3DHDS}

Using a CyberGlove, we collected approximately 9,000 examples of 3D
hand poses.  This data included hand configurations from American Sign
Language (ASL) and other configurations informally performed by
several subjects. Using computer graphics and an artificial hand
model, we then rendered each captured hand pose from multiple
viewpoints on the view sphere. We defined a set of 86 viewpoint angle
pairs $(\beta_1,\beta_2)$ so that the sphere surface is sampled
approximately uniformly. Thus we obtained a full dataset of $9{,}000
\times 86$ views.  Each view has an associated binary image mask
(silhouette), and a 22 DOF pose vector. Fig.\ \ref{fig:HAllViews}
shows the 86 viewpoints used in the dataset for a particular
configuration.

From these silhouettes, we extract the visual features that will
be used for further processing. In our implementation, we used two
classes of features (these features are not used together): Hu
moments and Alt moments. Alt moments \cite{Alt62} are translation
and scale invariant, but not rotation invariant. Hu moments
\cite{Hu62} are invariant to translation and scaling, and are also
invariant to rotation in the image plane. These moment features
were used in our implementation because they are relatively easy
to compute, and they provide invariants that are appropriate for
our demonstration application. However, our general formulation can be used with other visual feature representations
if desired. Detailed examination of the feature selection problem
is outside the scope of this paper, and remains a topic for future
research.

We define two experimental datasets:
\begin{enumerate}
\item {\em Hand-Single-View:} In this dataset, the hand is viewed
from only one viewpoint ($\beta_1=\pi/2$, $\beta_2=0$), generally
making the palm of the hand visible.  Silhouette features are
computed using Alt moments.  This yields approximately 9,000
input-output pairs.

\item {\em Hand-All-Views:} In this dataset, the hand is viewed
from all 86 viewpoints. Silhouette features are computed using Hu
moments. This yields approximately 750,000 input-output pairs.
\end{enumerate}

\subsubsection{Hand Detection and Segmentation}
\label{sec:segment}

For live video input, we will use video sequences collected with a
color digital camera. It will be assumed that these sequences have a
static background and only one person is present. In this
implementation, we are not considering hand occlusion analysis, which
by itself is a difficult task. Our system tracks both hands of the
user automatically using a skin color tracker \cite{sigal_2000,RosalesICCV01}.
%RRChange .... and the person is facing towards the camera  [is not needed]
%%SS: OK

\subsection{2D Human Body Pose Estimation}
\label{sec:2DBP}

In this application, our goal is to recover the articulated pose of a
human body observed in a single image. The methodology followed is
very similar to that used in the estimation of hand pose. However,
instead of joint angles, body pose will be specified in terms of
marker positions at a predetermined set of joints.  We will estimate
the 2D positions of these body markers in the image plane.

\subsubsection{Human Body Model}

The human body model is defined in terms of 20 3D marker positions
(60 DOF). The 20 markers are distributed as follows: three markers
for the head, three markers for the hip/back bone articulation,
plus one marker for each shoulder, elbow, wrist, hand, knee,
ankle, and foot. For computer graphics rendering, the body model
is composed of cylinders of equal width. The cylinders connect the
markers to form the standard human body structure. The thorax is
modeled using a wider cylinder. Because we are only interested in
the shape of the projected model, we do not include texture or
illumination in our rendering.

\subsubsection{Human Body Pose Dataset}

Human body motion capture data was obtained from several sources:
http://www.biovision.com, Matt Brand's dataset \cite{Brand99}, and
several demo sequences in the software package {\em Character Studio}.
In total there are 32 captured sequences that depict variations of
different activities: dancing, walking, kicking, waving, throwing,
jumping, signaling, crouching down. The total number of frames
collected is approximately 7,000, mostly at 30 frames/second. Using
computer graphics and our artificial body model, we then rendered each
frame from 16 equally-spaced viewpoints on the equator of the view
sphere centered at the hip of the body model. For each view, we also
used the camera model to obtain the 2D marker positions in the image
plane. Thus we obtained a full dataset of approximately $7{,}000 \times
16$ views.  Each view has an associated binary image mask (silhouette)
and a 40 DOF projected marker vector. From the silhouettes, we extract
the visual features that will be used as input. We have chosen Alt
moments \cite{Alt62} as our visual features, mainly due to their ease
of computation and invariance to translation and scaling. We call
this the {\em Body-All-Views} dataset.

\subsubsection{Detection and Segmentation}

\label{sec:BodyDet}

For live video input, we use sequences collected with a color digital
camera. It is assumed that these sequences have a static background,
only one person is present, and the person is fully-visible. We use a
simple and widely-used human body segmentation scheme
\cite{Hogg83,Wren96}. The technique employs statistical learning to
acquire a model of the background appearance, where each pixel's color
(luminance) is represented by a Gaussian distribution. Segmentation is
then approached using maximum-likelihood, where each pixel is
classified as belonging to the background or the foreground (human
body).

\out{
The above process yields a set of input-output (cue-pose) pairs to
be used in our experiments.  In this case, the cues are the Alt
moments for a particular view, and the pose is encoded in terms of
the projected locations of the body markers in the image plane (40
DOF).}

\subsection{Common Implementation Details}

We now briefly discuss implementation details common to both
applications.

\subsubsection{Mapping Functions}

In Sec.\ \ref{sec:ProMod}, it was not specified what class of
(deterministic) mapping functions ${\phi_k}$ were to be used. Our
framework is practically independent of this choice. However, from
Eq.\ \ref{eq:theta_up0} we can notice that there are clear advantages
in the M-step if these functions are differentiable with respect to
their parameters. In the case of quadratic or linear functions, the
M-step can be performed exactly in one step. However, the power of
these functions is limited.  In our implementation each function is a
multi-layer perceptron with one hidden layer (MLP). For this
non-linear function there does not exist a closed-form solution for
Eq.\ \ref{eq:theta_up0}. We used four to five iterations of the
conjugate gradient descent method per M-step.

\hide{
% this paragraph isn't needed
For the non-linear one hidden layer perceptrons, there does not
exist a closed form solution for Eq.~\ref{eq:theta_up0}. We use
the conjugate gradient (CG) optimization method, for performing
the M-step. If $\phi_k$ is a one hidden layer perceptron with
parameters $\theta_k$, we have:

%% This is simply a restatement of Eq 14. with nothing new.  So it should not be included.

\beqa \frac{\partial E}{\partial \theta_k}&=&\sum_n
\tilde{Q}(y_i)[(\frac{\partial}{\partial
\theta_k}\phi_{k}(\upsilon_n,\theta_k))^\top\Sigma_{k}^{-1}
(\psi_n-\phi_{k}(\upsilon_n,\theta_k))], \eeqa

% I thought about adding this, but it seems non-essential at this point (sorry).

Since in a one hidden layer perceptron the parameters are a set of
real-valued weights, let us explicitly denote the parameters of
$\phi_k$ as $\theta^k=\{w_{jil}^k\}$, where $w_{jil}^k$ denote the
synaptic weight from node $i$ to node $j$ in layer $l$, for the
function $k$ \cite{HaykinBook96}. Also, denote $\varphi$ the
non-linear function relating input $s$ to output activity $r$ in the
hidden layer nodes, \ie $r_i^{(2)}=\varphi(s_i^{(2)})$, the output
nodes are assumed linear, \ie $r_i^{(3)}=\alpha s_i^{(3)}$
\footnote{In both cases the biases are embedded in the function
definitions.}. With this re-parameterization, we can then show that
the gradient for  function $k$ is:


If $l=2$ ($w$ connects the hidden with the output layer): \beqa
\frac{\partial}{\partial
w_{jil}^k}\phi_{k}(\upsilon,\theta_k)=-r_i^{(3)}\varphi'(s_j^{(3)})
\eeqa

If $l=1$ ($w$ connects the input to the hidden layer): \beqa
\frac{\partial}{\partial
w_{jil}^k}\phi_{k}(\upsilon,\theta_k)=-r_i^{(3)}\varphi'(s_j^{(3)})\sum_q
\varphi'(s_q^{(2)}) w_{qj}^{(2)} , \eeqa

with $s_i^l$ the input in node $i$ in layer $l$ and $r_i$ its
corresponding output activity.}

\subsubsection{Generative Model Details: Feedback Functions}

There are at least two ways to define the feedback function. On the one hand,
$\zeta$ could be a computer graphics rendering function. On the other
hand, we could estimate an approximate $\hat{\zeta}$ given a set of
output-input training examples. In our implementation, we experimented
with both ideas. For $\zeta$, we used computer graphics
renderings of our hand and body models obtained via OpenGL. For
$\hat{\zeta}$, we used a one hidden-layer perceptron, with twenty
hidden nodes. In our experience, this provides an adequate and
efficient approximation.
%RRChange .... with twenty hidden [replaced] with five hidden...
%%SS: OK

The approximate feedback function is useful primarily because it is
faster to compute than a graphical rendering followed by visual
feature computation. \changed{The key issue to keep in mind is that the
feedback mapping is assumed to be simple (one-to-one or even
many-to-one) or to have a known form; otherwise, if we assume
functional forms that are too simple, we would only introduce more
estimation errors. Of course, this is just a practical issue}. If the feedback
mapping is too complex to approximate easily, we could always rely on
the available feedback function $\zeta$.
%%RRChange [Added] many-to-one
%%SS: OK

\subsubsection{Computational Performance}

For an Athlon 1400 PC with 2GB memory, running unoptimized Matlab 6.0
code, it takes approximately five hours to train a model with 10
dimensions (input) and 10 dimensions (output), using 4500 patterns,
and 40 single hidden layer perceptrons with five hidden nodes
each. The system can infer body poses at approximately 11 frames per
second, using the Mean Output (MO) algorithm. \changed{This
approach's} related computations take approximately 70\% of this
time. This time includes OpenGL-based rendering of body poses in
$\zeta$. The rest is spent in segmentation and feature
calculations. The Multiple Sample (MS) algorithm takes time
proportional to the number of samples used. Of course, segmentation
and feature computation for the segmented image is done only once. We
noticed that for our implementation, if we use the approximate
feedback function, $\hat\zeta$, the rendering time is reduced to
approximately one-fourth.
%%RRChange, [I looked at my notes and fixed this]
%%SS: OK

\subsubsection{Early Stopping During Training}

During model training, we used cross-validation for early stopping and
to avoid over-fitting as follows:
%\footnote{The Minimum Description
%Length (MDL) principle \cite{Rissanen86} was also used to avoid
%overfitting as explained in the experiments}:
%%RRChange [added footnote]
%%SS: Removed: I must insist (sorry).
%% This is redundant with text elsewhere and unrelated to early stopping.

\begin{itemize}
\item {\em Training data:} Stop if the log-likelihood changes less
than 0.5\% averaged over the last ten iterations.
\item {\em Held out data:} Stop if the held out data
log-likelihood average change is negative over the last ten
iterations. Held out data was chosen in the same way as the
training and test data.
\item {\em Number of iterations:} Stop if a maximum of 200
iterations is reached.
\end{itemize}

\setlength{\tabcolsep}{1pt} %%SS: This changes separation between table columns
\renewcommand\arraystretch{0.25} %% SS: This changes separation between table rows

\section{Experimental Results}
\label{sec:Exp}

We now present experimental results obtained using our approach in
estimating the pose of the human hand and body. For many additional
performance experiments not included due to space limitations, the
reader is referred to \cite{RosalesPhDThesis} and for several MO
estimation videos to
http://www.psi.toronto.edu/$\sim$romer/SMAHandVideos.htm. The SMA
application independent Matlab code can be found at
http://www.psi.toronto.edu/$\sim$romer/SMACode.htm.

\subsection{Hand Pose Estimation Given a Fixed Camera Viewpoint}
\label{sec:FixCam}

In our first experiments, our approach is tested in the task of
recovering 3D human hand pose given a fixed camera viewpoint: a
view towards the palm of the hand. For training, we used the {\it
Hand-Single-View} dataset, which contains a total of approximately
9,000 examples. Of these, 3,000 were used for training and the
rest for testing.  All experiments were performed on a test set
that shared no common poses with the training set. The
input-output pairs were then defined as follows.  The input
consisted of 10 Alt moments computed from the silhouette of the
hand, as described in Sec.\ \ref{sec:AppsHand}. The output
consisted of 20 joint angles of a human hand linearly encoded by
nine values using Principal Component Analysis (PCA).

The number of mixture components for the discriminative model (mapping
functions) was set to 20. This number was found to be optimal in the
sense of the Minimum Description Length (MDL) principle
\cite{Rissanen86}; we found this number via a rough model search
(testing MDL and getting the score for the optimized model with
$10, 12, \ldots, 24$ functions). Each mapping function (for each of the
Gaussians in the mixture) was an MLP with seven hidden neurons.

\subsubsection{Quantitative Results}

We randomly selected approximately 4,000 frames not included in the
training set. Since ground-truth is available, we used the average
absolute difference per joint angle (between ground-truth and
estimate) as error measure. Table~\ref{tab:Err1} summarizes our
results (see caption).

\begin{table}[t] {
\begin{center}
\begin{tabular}{|c|c|c|c|c|c|c|c|c|}
\hline
 & MO-MAP ($\hat{\zeta}$) & MS-MAP ($\hat{\zeta}$) & MS-20 ($\hat{\zeta}$) & MO-MAP ($\zeta$) & MS-MAP ($\zeta$) & MS-20 ($\zeta$) & Rand/train & Range \\
\hline
\hline
$\hat{\cl E}$ & 0.1322 & 0.1667 & 0.1465 & 0.1651 & 0.1769 &0.1785 & 0.4294 & 1.55\\
\hline
$\sigma^2_{\hat{\cl E}}$ & 0.0317 & 0.0415 &0.0371 & 0.0425 & 0.0452 &0.0547 & 0.1630 & -\\
\hline
\end{tabular}
\end{center}}
\mycaptionS{Mean absolute error $\hat{\cl E}$ and variance
$\sigma^2_{\hat{\cl E}}$. Inference performance using different
rendering functions ($\zeta$ and $\hat{\zeta}$) and inference
algorithms (MO-MAP and MS-MAP). Also shown, the accuracy of the most
probable reconstructions given by MS (MS-20). As a point of
comparison, results are presented for an algorithm that randomly
chooses one of the training examples as result (Rand/train). The
average range of the data is also shown as a reference point. All
units are in radians.}
\label{tab:Err1}
\end{table}

\hide{
Using the estimated feedback function $\hat\zeta$ in the
Mean Output approach (MO), the average $L_2$ error between
reconstruction and ground-truth was $0.1863$ radians (approximately
$10^o$), with variance $0.0185$. These error estimates are averaged
over joint angles.  We ran this experiment with the same test set, but
instead used the computer graphics rendering feedback function
$\zeta$. When using $\zeta$, similar accuracy was obtained. The
average $L_2$ error between reconstruction and ground-truth in this
case was $0.241$ radians, with variance $0.0312$. Their symmetric KL
divergence is 0.134 bits. In \cite{RosalesPhDThesis}, we explain in
detail possible reasons for this difference in performance.
%SChange previous paragraph
}
\psfigurepath{./figs/H90}
\begin{figure*}[ht]
\centerline{\begin{tabular}{rcccccccccc}
GT &
\psfig{figure=HandData90_R10i.TestGT.mat.02096.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03973.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03275.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01965.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01265.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.00655.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01729.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02576.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01877.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01091.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0_R10i.02096.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03973.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03275.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01965.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01265.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.00655.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01729.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02576.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01877.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01091.eps,width=0.61in,clip=t} \\
%
\\
\out{
\hline
\\
%
GT &
\psfig{figure=HandData90_R10i.TestGT.mat.03942.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03569.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01572.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02273.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02575.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01681.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01659.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02401.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02751.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02183.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0_R10i.03942.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03569.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01572.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02273.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02575.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01681.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01659.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02401.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02751.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02183.eps,width=0.61in,clip=t} \\
%
\\}
\hline
\\
%
GT &
\psfig{figure=HandData90_R10i.TestGT.mat.02663.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03842.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02162.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02353.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02369.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.04272.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.04048.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03872.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03856.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03840.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0_R10i.02663.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03842.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02162.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02353.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02369.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.04272.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.04048.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03872.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03856.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03840.eps,width=0.61in,clip=t} \\
%
\\
\hline
\\
%
GT &
\psfig{figure=HandData90_R10i.TestGT.mat.03296.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02928.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02896.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02784.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02672.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01825.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02576.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02449.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02001.eps,width=0.61in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03328.eps,width=0.61in,clip=t}\\
%
MO &
\psfig{figure=Res_softH2V20-2_0_R10i.03296.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02928.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02896.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02784.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02672.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01825.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02576.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02449.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02001.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03328.eps,width=0.61in,clip=t} \\
\end{tabular}
} \mycaption{Estimated hand poses using the Mean Output (MO) algorithm
and $\hat{\zeta}$}{\small  \CapTestI}{} \label{fig:H90Res}
\end{figure*}

These experiments quantitatively confirmed that MO inference provides
a reasonable approximation, at least for this dataset. Recall from
Sec.\ \ref{sec:MO} that MO inference was based on the premise that the
most-likely reconstruction given by each \changed{discriminative
mixture component} provides a good approximation to the best solution
given by the full probability distribution.

Fig.\ \ref{fig:H90Res} shows example reconstructions obtained via
the MO approach. In many cases, the reconstruction is close to the
ground truth. In other cases, the silhouette is highly ambiguous,
and the reconstruction does not match ground truth. A good example
is shown in image pair number 34 (the last row-pair, fourth column),
where the camera's image plane is perpendicular to the axis of
the pinky finger. Note that the estimated hand pose disagrees with
the ground truth in several of the joint angles associated with this
finger. Similar effects with other joint angles can be seen in
example pairs 8, 16, 27, etc.

Ambiguous configurations are indeed very common with a binary
image representation.  Note that in other ambiguous cases shown in
Fig.\ \ref{fig:H90Res} the reconstruction is closer to the ground truth,
\eg pairs 19, 20, etc. Possible reasons for this agreement are
diverse:
\begin{enumerate}
\item The input is not really ambiguous (probabilistically speaking)
in the observation space. The other possible outputs (geometrically
speaking) associated with this input may be very unlikely given the
training set. This depends on the underlying structure of the
configuration manifold. One of the main goals of a learning algorithm
is to find this structure. Indeed these results show that our
algorithm is finding this structure, since in most cases, MO finds a
valid sample from the manifold.
%RRChange [last sentence]
%%SS: OK
\item \changed{The learned discriminative model was accurate at
modeling the given input using a single mixture component} (\ie few
mapping functions were trained to map this input, therefore the rest
of the functions produced irrelevant (bad) outputs).
\item By chance, among many very similarly probable solutions, the
{\it right} one was chosen. Of course, even with the help of chance in
this case, the discriminative model needed to be accurate enough at
approximating the true posterior so that samples were relevant at all.
%mapping functions needed to provide the
%right mapping for the given input $\mb{x}$.
\end{enumerate}

\hide{
The accuracy of the Multiple Samples (MS) inference approach was
tested in similar experiments with approximately $4,000$ randomly
chosen test examples not included in the training set. When the
estimated feedback function $\hat\zeta$ was used, the mean $L_2$ error
of the most likely sample to the ground-truth was $0.2202$ radians
with variance $0.0228$. The mean error and variance from the best 20
samples was $0.308$ and $0.0323$ respectively. When we performed the
same experiment, but instead used the computer graphics feedback
function $\zeta$, we obtained a mean error of $0.2628$ radians with
variance $0.0242$ for the most likely sample. The mean error of the
best 20 samples was $0.3128$ radians with variance $0.0300$.
}

%%RChange previous paragraph

\subsubsection{Experiments with Real Images}
\label{sec:H90RealImgs}

We now test our approach using uncalibrated video sequences, where the
camera is pointing towards the palm of a person's hand. On average,
the hand occupied an area of approximately $200 \times 200$
pixels. Segmentation was obtained as described in Sec.\
\ref{sec:segment}. In the first experiment, we use the MO approach to
obtain a single {\it best} estimate for each segmented hand. Estimates
for 30 frames, taken 0.9 seconds apart, are shown in Fig.\
\ref{fig:Real90TestIE}. Visually, we can see that in most cases the
estimate is a plausible explanation of the segmented silhouette.
However, there are also a few inaccurate reconstructions.
%as seen in the fourth row, columns 1 and 5.
%% SS: (figure changed, so these numbers are incorrect)

In general, it is expected that the model cannot perform well
in all configurations (this is true for almost any machine
learning model) due to the following reasons:
\begin{enumerate}
\item The proposal distribution $q(\mb{h}|\mb{x})$ does not resemble
the true posterior distribution $p(\mb{h}|\mb{x})$ at the particular
$\mb{x}=\mb{x}^*$: learning is the result of optimizing an {\it
expected} or average error.
\item The real hand and synthetic hand model features are similar
but not the same.  Anthropometric differences can influence
inference accuracy.
\item Even the best model could fail in some configurations.
Information theory tells us that this is always the case except
when the {\it information} in the features is equal to the entropy
of the body pose configurations; in other words, when features
tell us everything needed about the configuration. Otherwise,
there might be multiple explanations for a given visual feature
vector.
\end{enumerate}

In order to test the ability of the system to provide these multiple
explanations, we tested the Multiple Samples (MS) approach. Fig.\
\ref{fig:Real90TestII} shows the estimates found using MS. These
estimates can be interpreted as possible hypotheses of hand
configurations given the silhouettes. \changed{Note that MS tends to
bias the hypotheses towards samples from the distribution
$q(\mb{h}|\mb{x}^*)$, but we can account for this when building a full
probability distribution, as explained in Sec.~\ref{sec:GenInf}.}

\psfigurepath{./figs/RealResultsHand2}
\begin{figure*}[ht]
\centerline{\small \begin{tabular}{rcccccccccc}
RV &
\psfig{figure=Subsampled1_F71_770.rle.00001.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00006.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00011.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00016.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00021.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00026.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00031.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00036.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00041.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00046.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00001.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00006.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00011.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00016.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00021.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00026.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00031.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00036.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00041.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00046.eps,width=0.61in,clip=t} \\
%
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F71_770.rle.00051.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00056.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00061.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00066.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00071.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00076.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00081.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00086.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00091.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00096.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00051.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00056.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00061.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00066.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00071.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00076.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00081.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00086.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00091.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00096.eps,width=0.61in,clip=t} \\
%
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F71_770.rle.00101.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00106.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00111.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00116.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00121.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00126.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00131.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00136.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00141.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00146.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00101.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00106.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00111.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00116.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00121.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00126.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00131.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00136.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00141.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00146.eps,width=0.61in,clip=t} \\
\out{
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F71_770.rle.00151.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00156.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00161.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00166.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00171.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00176.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00181.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00186.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00191.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00196.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00151.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00156.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00161.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00166.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00171.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00176.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00181.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00186.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00191.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00196.eps,width=0.61in,clip=t}\\}
\end{tabular}
} \mycaption{Hand pose estimates in real video sequences (RV)
using the Mean Output algorithm (MO)}{\small \CapRTestIE}{}
\label{fig:Real90TestIE}
\end{figure*}

\psfigurepath{./figs/RealResultsHand2}
\begin{figure*}[ht]
\centerline{
\begin{tabular}{ccccccc}
RV & MO& S1& S2& S3 & S4 & S12 \\
%
\psfig{figure=Subsampled1_F71_770.rle.00010.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00010.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00010_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00010_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00010_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00010_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00010_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00019.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00019.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00019_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00019_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00019_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00019_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00019_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00028.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00028.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00028_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00028_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00028_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00028_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00028_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00037.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00037.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00037_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00037_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00037_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00037_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00037_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00046.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00046.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00046_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00046_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00046_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00046_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00046_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00055.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00055.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00055_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00055_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00055_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00055_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00055_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00064.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00064.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00064_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00064_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00064_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00064_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00064_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00073.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00073.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00073_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00073_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00073_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00073_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00073_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00082.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00082.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00082_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00082_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00082_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00082_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00082_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00091.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00091.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00091_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00091_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00091_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00091_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00091_012.eps,width=0.61in,clip=t}\\
\end{tabular}}
 \mycaption{Hand pose estimates in real sequences using the Multiple
Samples (MS) algorithm}{\small \CapRTestII}{} \label{fig:Real90TestII}
\end{figure*}


\subsection{3D Hand Pose Reconstruction Given an Unrestricted Camera Viewpoint}
\label{sec:3DHan}


Our approach is now tested in the task of recovering 3D human hand pose
from an unknown camera viewpoint. For training, we used the {\it
Hand-All-Views} dataset, which contains a total of approximately
750,000 examples. Of these, 18,000 were used for training and the
rest for testing. The input-output pairs were then defined as
follows.  The input consisted of seven Hu moments computed from
the silhouette of the hand, as described in Sec.\
\ref{sec:AppsHand}. The output consisted of 20 internal joint
angles of the hand and two orientation angles. This 22 DOF
representation was linearly encoded by nine values using PCA.

The number of \changed{mixture components} (mapping functions) was
set to 45.  This number was determined via the MDL criterion, as
before (testing for the best MDL score using models with
$35, 37, \ldots, 51$ functions). Each function was an MLP with seven
hidden nodes.

\subsubsection{Quantitative Results}

As before, we computed the absolute error in estimating hand pose, and
quantitatively compared this measure across views. Fig.\
\ref{fig:HandPerf1SampleS} shows the error of the most likely estimate
found using the MO approach. From the graphs we see that views towards
the palm of the hand ($90^\circ$) are slightly easier to reconstruct
on average, while the variance seems similar across views. As
expected, the average error is higher than that obtained for the fixed
view hand pose reconstruction experiments.  It seems that for
unrestricted hand views it is slightly advantageous to use the computer
graphics feedback function $\zeta$. This is probably because
estimating this inverse mapping $\hat\zeta$ \changed{(to define the
generative model)} over unrestricted viewpoints is more complicated
than for frontal hand views only (and the mapping itself is likely to
be more complex).
%%RChange previous paragraph
%SChange Previous paragraph

Fig.\ \ref{fig:HandPerf1SampleM} shows the results using the MS
approach. Fig.\ \ref{fig:HandPerf1SampleM}(a) shows the error
associated with the best sample. This error behaves very similarly to
the MO error. Fig.\ \ref{fig:HandPerf1SampleM}(b) shows the average
error computed using the best 20 samples. This error is higher than
that of the best sample. Note that this is not an obvious result given
that the best sample is determined without having knowledge of
ground-truth. In fact, if the average error of the best 20 samples
were lower than that of the best sample, then we could infer that our
algorithm is very inaccurate at determining which samples are
better. Thus, this result positively endorses our MS algorithm.
%%RRChange  Thus this result positively endorses our MS algorithm.
%%SS: OK

For comparison, we used the ground-truth to select the best sample,
based on minimum error.  In other words, we have an oracle that picks
the sample closest to the ground-truth. The resulting performance
graph is shown in Fig.\ \ref{fig:HandPerf1SampleM}(c).  This
represents the lower-bound on the reconstruction error using the
learned forward model. The graph is interesting in the sense that it
separates the errors from the forward and feedback models.

%ATT!!!!
%The feedback model produces a RMSE $< 0.35$ across views. This is
%roughly half the total RMSE error produced by our method overall.
\psfigurepath{./figs}
\begin{figure}[t]
\centerline{(a)
\psfig{figure=GraphViewsRes_softH2V31-7_7_GR7i.mat.eps,width=1.8in,clip=t}
%GG_Res_softH2V31-7_7_GR7i.mat.EType_0.eps,width=3in,clip=t}
~~~(b)
\psfig{figure=GraphViewsRes_softH2V31-7_7_R7iE.mat.eps,width=1.8in,clip=t}
%\psfig{figure=GG_Res_softH2V31-7_7_R7iE.mat.EType_0.eps,width=3in,clip=t}
} \mycaption{Unrestricted view model performance using Mean Output
(MO) and $\hat{\zeta}$}{\small Mean Output (MO) inference
performance for unrestricted view tests at given viewpoint
latitudes (averaging over longitude). The feedback function is (a)
the estimated $\hat\zeta$ (b) the computer graphics rendering
$\zeta$. A frontal view of the hand palm is at latitude
$\beta_1=\pi/2$, longitude $\beta_2=0$. For reference, the
performance of an algorithm that chooses the estimate at random from
the training data is also shown. The angle range is on average 1.87
radians}{}
\label{fig:HandPerf1SampleS}
\end{figure}
%%RChange previous figure
%SChange previous caption

\psfigurepath{./figs}
\begin{figure}[t]
\centerline{\small (a)
\psfig{figure=GraphViewsMS1.eps,width=1.8in,clip=t}
%GG_Res_softH2V31-7_7_R7iM.mat.EType_1.eps,width=2.0in,clip=t}
\small (b)
\psfig{figure=GraphViewsMS2.eps,width=1.8in,clip=t}
%\psfig{figure=GG_Res_softH2V31-7_7_R7iM.mat.EType_2.eps,width=2.0in,clip=t}
\small (c)
\psfig{figure=GraphViewsMS3.eps,width=1.8in,clip=t}
%\psfig{figure=GG_Res_softH2V31-7_7_R7iM.mat.EType_3.eps,width=2.0in,clip=t}
} \mycaption{Unrestricted view model performance using multiple
sampling and $\hat{\zeta}$}{\small Multiple Samples (MS) inference
for unrestricted view tests at given viewpoint latitudes
(averaging over longitude).  The feedback function is the estimated
$\hat{\zeta}$. A frontal view of the hand palm is at latitude
$\beta_1=\pi/2$, longitude $\beta_2=0$. (a) Most probable sample.
(b) Average over all samples (20 most probable samples taken). (c)
Best sample (determined using ground-truth information for
comparison). For reference, the performance of an algorithm that
chooses the estimate at random from the training data is also shown.
The angle range is on average 1.87 radians}{} \label{fig:HandPerf1SampleM}
\end{figure}

\subsubsection{Experiments with Real Images}
\label{sec:HAnyRealImgs}

We test our approach using video of hands (in any orientation)
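collected from a single uncalibrated camera. Pose estimates from 40
frames (taken 0.9 seconds apart) obtained via the MO approach are
% NOTE(review): the figure currently displays 30 frames (one block of 10 is disabled via \out); confirm the frame count.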
shown in Fig.\ \ref{fig:RealAnyTestIE}. Note that there are
incorrectly-segmented hands in this sequence. We decided to leave
these in to avoid frame rearrangements (losing the uniform frame
sampling), to show that segmentation does not always work correctly,
and to show that this approach is inherently robust to extreme
segmentation errors.  In this experiment, there was usually visual
agreement between reconstruction and estimate as seen in the
figure. Note that even for a human observer, looking at the segmented
silhouettes in the figure, reconstruction is sometimes
ambiguous. There are also some configurations for which the system did
not perform correctly.

Fig.\ \ref{fig:RealAnyTestIIE} shows the estimates obtained via the MS
approach. The frames shown were taken approximately every 0.9
seconds. In the second row, we can see some limitations of the Hu
moment feature space: sometimes, different hand orientations are very
similar in the feature space. These apparently different hypotheses
are close to each other in terms of their probability, given the
features. The same effect repeats clearly in the third and sixth
row. This problem might be alleviated by using a different input
feature space. At an extreme one might consider the full silhouette as
a feature. Of course, there are important trade-offs to take into
account when considering different features, e.g., invariants and
dimensionality.

\psfigurepath{./figs/RealResultsH2Unr}
\begin{figure*}[ht]
\centerline{\small
\begin{tabular}{rcccccccccc}
RV &
\psfig{figure=Subsampled1_F771_1269.rle.00001.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00006.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00011.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00016.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00021.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00026.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00031.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00036.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00041.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00046.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00001.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00006.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00011.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00016.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00021.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00026.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00031.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00036.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00041.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00046.eps,width=0.61in,clip=t} \\
\out{
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F771_1269.rle.00051.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00056.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00061.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00066.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00071.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00076.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00081.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00086.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00091.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00096.eps,width=0.61in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00051.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00056.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00061.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00066.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00071.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00076.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00081.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00086.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00091.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00096.eps,width=0.61in,clip=t} \\}
%
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F771_1269.rle.00101.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00106.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00111.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00116.eps,width=0.61in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00121.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00126.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00131.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00136.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00141.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00146.eps,width=0.61in,clip=t}\\
%
MO &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00101.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00106.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00111.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00116.eps,width=0.61in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00121.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00126.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00131.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00136.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00141.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00146.eps,width=0.61in,clip=t}\\
%
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F771_1269.rle.00151.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00156.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00161.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00166.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00171.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00176.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00181.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00186.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00191.eps,width=0.61in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00196.eps,width=0.61in,clip=t}\\
%
MO &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00151.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00156.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00161.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00166.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00171.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00176.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00181.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00186.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00191.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00196.eps,width=0.61in,clip=t}\\
\end{tabular}}
 \mycaption{Estimated hand poses from real sequences using  Mean
Output (MO) algorithm and $\zeta$}{\small \CapRTestIE}{}
\label{fig:RealAnyTestIE}
\end{figure*}


\psfigurepath{./figs/RealResultsH2UnrM}
\begin{figure*}[ht]
\centerline{
\begin{tabular}{ccccccc}
RV & MO & S1 & S2 & S3 & S4 & S12 \\
%
\psfig{figure=Subsampled1_F771_1269.rle.00151.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00151.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00051_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00051_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00051_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00051_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00051_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00160.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00160.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00054_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00054_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00054_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00054_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00054_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00169.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00169.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00057_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00057_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00057_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00057_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00057_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00178.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00178.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00060_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00060_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00060_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00060_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00060_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00187.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00187.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00063_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00063_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00063_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00063_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00063_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00196.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00196.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00066_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00066_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00066_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00066_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00066_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00214.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00214.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00072_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00072_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00072_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00072_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00072_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00223.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00223.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00075_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00075_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00075_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00075_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00075_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00001_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00001_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00001_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00001_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00001_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00064.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00064.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00022_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00022_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00022_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00022_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00022_012.eps,width=0.61in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00091.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00091.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_001.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_002.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_003.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_004.eps,width=0.61in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_012.eps,width=0.61in,clip=t}\\
%              Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_012
\end{tabular}} \mycaption{Estimated hand poses from real video (RV) sequences using
Mean Output (MO) and Multiple Samples (MS) inference}{\small \CapRTestIIE}{}
\label{fig:RealAnyTestIIE}
\end{figure*}

\subsection{2D Human Body Pose Reconstruction}\label{sec:2DHum}

In order to show that our approach can be employed, with no change, to
perform other similar tasks (possibly with a different
representation), here we now conduct performance tests in the task of
estimating human body pose from a single image. The goal is to
estimate the 2D locations of body markers in the image, given visual
features computed from the person's silhouette. In this experiment, we
use the {\it Body-All-Views} dataset, which contains a total of
over 100,000 samples.  Of these, 8,000 were used for training and the
rest for testing. The input-output pairs were defined as follows.  The
input consisted of the 10 Alt moments computed from the
silhouette. The output consisted of 20 2D marker positions (40 DOF),
which were then linearly encoded by nine values using PCA.

The number of \changed{mixture components in the discriminative model}
was set to 15. This number was determined via the MDL criterion,
exactly as before. Each function is an MLP with seven hidden nodes.

\subsubsection{Quantitative Results}

Fig.\ \ref{fig:ArtC} shows the reconstruction obtained with the MO
approach for frames taken from three synthetic sequences
excluded from the training set.
The agreement between reconstruction and observation is easy to
perceive for all frames. Also, for self-occluding configurations,
the estimate is still similar to ground-truth.
%It is important to
%remark that no human intervention nor pose initialization was
%required.
%RRChange, It is important to remark that
%SS: Removed. This is redundant.  I can't point to at least
% two other places
% in the paper where you say this already.

Fig.\ \ref{fig:ArtCP} shows the average marker error and variance per
body orientation as a percentage of body height. Note that the error is
larger for orientations closer to $0$ and $\pi$ radians.  This
intuitively agrees with the notion that at those angles (side-views),
there is less visibility of the body parts.  We consider this
performance promising, given the complexity of the task and the
simplicity of the approach. Just as a reference point, by choosing
poses at random from those in the training set, the RMSE was 10.35\%
of body height (with a standard deviation of 4.4\%). In related work,
quantitative performance has usually been ignored, in part due to the
lack of ground-truth and standard evaluation datasets.
%SChanged above paragraph

%% \begin{figure}[h]
%% \parbox[c]{0.615\textwidth}{
%% \psfigurepath{../NIPS01/epsArt}
%% \centerline{GT
%% \psfig{figure=ArtSil_00000.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00001.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00002.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00019.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00023.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00038.Art40.eps,width=0.65in,clip=t}
%% } \centerline{MO
%% \psfig{figure=000000-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000001-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000002-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000019-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000023-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000038-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% } \vspace{-.2in}\rule[.0in]{4.0in}{0.01in}
%% \psfigurepath{../NIPS01/epsArt2}
%% \centerline{GT
%% \psfig{figure=ArtSil_00004.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00005.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00006.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00007.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00009.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00013.Art40.eps,width=0.65in,clip=t}
%% } \centerline{MO
%% \psfig{figure=000004-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000005-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000006-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000007-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000009-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000013-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% } \vspace{-.4in}\rule[.0in]{4.0in}{0.01in} \centerline{GT
%% \psfig{figure=ArtSil_00035.Art40.eps,width=0.65in,clip=t}
%% \psfig{figure=ArtSil_00036.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00038.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00041.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00045.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00049.Art40.eps,width=0.65in,clip=t}
%% } \centerline{MO
%% \psfig{figure=000035-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000036-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000038-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000041-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000045-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000049-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% } } \hfill
%% \parbox[c]{0.42\textwidth}{
%% \psfigurepath{../NIPS01/eps}
%% \centerline{\psfig{figure=ViewPointTest.eps,width=0.4\textwidth,clip=t}}}
%% \mycaptionS{\small Left: Example reconstruction of several test
%% sequences with CG-generated silhouettes. Each set consists of
%% input images and reconstruction (every 5th frame). Right: Marker
%% root-mean-square-error and variance per camera viewpoint (every
%% $2\pi/32$ rads.). Units are percentage of body height. Approx.
%% 110,000 test poses were used. } \label{fig:ArtC}
%% \end{figure}


\psfigurepath{../NIPS01/epsArt/}
\begin{figure}[t]
\centerline{\small
\begin{tabular}{rccccccccc}
GT &
\psfig{figure=ArtSil_00000.Art40.eps,width=0.65in,clip=t} &
\psfig{figure=ArtSil_00001.Art40.eps,width=0.65in,clip=t} &
\psfig{figure=ArtSil_00002.Art40.eps,width=0.65in,clip=t} &
\psfig{figure=ArtSil_00019.Art40.eps,width=0.65in,clip=t} &
\psfig{figure=ArtSil_00023.Art40.eps,width=0.65in,clip=t} &
\psfig{figure=ArtSil_00038.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=ArtSil_00004.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=ArtSil_00005.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=ArtSil_00006.Art40.eps,width=0.65in,clip=t} \\
MO &
\psfig{figure=000000-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000001-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000002-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000019-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000023-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000038-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=000004-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=000005-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=000006-431602080.Art40.tif.eps,width=0.65in,clip=t} \\
\\
\hline
\\
GT &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00007.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00009.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00013.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00035.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00036.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00038.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00041.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00045.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00049.Art40.eps,width=0.65in,clip=t} \\
MO &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000007-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000009-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000013-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000035-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000036-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000038-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000041-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000045-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000049-431602080.Art40.tif.eps,width=0.65in,clip=t} \\
\end{tabular}
}\hfill \mycaptionS{\small Example reconstruction of frames from 
test sequences with computer graphics-generated silhouettes.
%Each %several
%set consists of input images and reconstruction.
% (every 5th frame).
} \label{fig:ArtC}
\end{figure}


\psfigurepath{../NIPS01/eps}
\begin{figure}[t]
\centerline{\psfig{figure=ViewPointTest.eps,width=0.3\textwidth,clip=t}}
\mycaptionS{\small Root mean-square-error (divided by number of
markers) and variance per camera viewpoint (every $2\pi/32$
rads.). Units are percentage of body height. Approximately 110,000
test poses were used.}
\label{fig:ArtCP}
\end{figure}


\subsubsection{Experiments with Real Images}

We now test the approach using real video sequences of human body
motion. We use the basic segmentation approach described in
Sec.~\ref{sec:BodyDet} to obtain silhouettes. Fig.~\ref{fig:ExampR0}
shows examples of system performance obtained via the MO approach for
several relatively complex motion sequences. Even though the
characteristics of the segmented body differ from the ones used for
training, good performance is still achieved. Most reconstructions are
visually close to what can be thought of as the right pose
reconstruction. Body orientation is also accurate. \changed{In the
Figure, we can see two particularly difficult configurations at the
second row of real video (RV) images, fourth-sixth columns; the arm
configuration is difficult to estimate}. \changed{ This could be due
to the lack of relevant training data; as a consequence, the
discriminative model $q$ may not approximate the generative model $p$
very well around the input vector. In general, an important issue to
keep in mind is that the visual differences between the rendered model
and the real body observed could become critical and thus accurate
rendering may be desirable. This varies from application to
application; however in any case the general inference approach
presented here remains the same.}

%We used 60 specialized functions, each one was a MLP with five
%hidden nodes.
\hide{ Fig.\ \ref{fig:RealBodyMS} shows the top-ranked pose samples
obtained via the MS approach.  Note that despite low-quality
segmentation, the system outputs reasonably accurate pose
hypotheses. Orientation is accurate and the relative limb
relationships are maintained. However, we can observe that some poses
are inherently difficult and the estimate lacks enough pose detail to
be perceived as a good estimate. For example, the eighth row shows a
side view of a person raising one arm while keeping the other arm at
rest. The resulting MS estimates all show a side-view, however none
has the correct arm configuration.  } 

%One difference with respect to the hand pose estimation task is
%that the rendering quality or realism for body pose is poorer for
%the human body renderer.
In this work, we did not pursue use of a more realistic human body
renderer. Due to differences in shape and width of body components
observed in training versus testing, the visual features may differ.
This is a relevant point since in almost all learning models, it is
expected that the training data be a good approximation to the real
test data. Improving the match between visual features used in
training and testing, and thus potentially the overall performance, is
an area that we plan to investigate in future
research. \changed{Despite the fact that we have ignored differences
in anthropometric characteristics between CG and real silhouettes, the
performance observed for both articulated objects (hands and human
bodies) is excellent given that only a single image is assumed
available.}

%In theory this could allow us to adapt our algorithm to different body
%or hand anthropometric characteristics.

%\hide{
\psfigurepath{../NIPS01/eps}
\begin{figure}[h]
\centerline{\small \begin{tabular}{rcccccccccc}
RV &
\psfig{figure=Sil_00001.1.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00002.1.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00003.1.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00004.1.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00005.1.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00000.2.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00001.2.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00002.2.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00000.3.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00001.3.eps,width=0.65in,clip=t}\\
MO &
\psfig{figure=000001-431602080.1.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000002-431602080.1.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000003-431602080.1.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000004-431602080.1.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000005-431602080.1.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000000-431602080.2.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000001-431602080.2.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000002-431602080.2.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000000-431602080.3.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000001-431602080.3.tif.eps,width=0.65in,clip=t}\\
\\
\hline
\\
RV &
\psfig{figure=Sil_00001.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00002.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00003.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00004.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00005.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00006.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00007.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00008.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00009.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00010.4.eps,width=0.65in,clip=t} \\
MO &
\psfig{figure=000001-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000002-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000003-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000004-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000005-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000006-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000007-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000008-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000009-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000010-431602080.4.tif.eps,width=0.65in,clip=t}\\
\end{tabular}
} \caption{\small Reconstruction obtained from observing a human
subject (every 10th frame).} 
\label{fig:ExampR0}
\end{figure}

\hide{
\psfigurepath{./figs/ResRealBodyMS}
\begin{figure*}[ht]
\centerline{
\begin{tabular}{cccccc}
RV & S1 & S2 & S3 & S4 & S12 \\
%
\psfig{figure=Sil_00001.1.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S001.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S006.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S007.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S008.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S009.1.tif.eps,width=0.65in,clip=t} \\
%
\psfig{figure=Sil_00003.1.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S001.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S002.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S005.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S006.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S008.1.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00005.1.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S001.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S003.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S005.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S006.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S011.1.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00001.2.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S007.2.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S008.2.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S009.2.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S010.2.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S011.2.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00001.3.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S003.3.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S004.3.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S005.3.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S006.3.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S007.3.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00001.4.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S001.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S002.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S003.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S006.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S007.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00003.4.eps,width=0.65in,clip=t}&
\psfig{figure=00002_S002.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00002_S005.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S007.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S008.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S010.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00005.4.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S003.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S004.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S005.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S006.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S007.4.tif.eps,width=0.65in,clip=t} \\
%
\psfig{figure=Sil_00007.4.eps,width=0.65in,clip=t}&
\psfig{figure=00007_S001.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00007_S002.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00007_S003.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00007_S004.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00007_S005.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00009.4.eps,width=0.65in,clip=t}&
\psfig{figure=00009_S001.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00009_S002.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00009_S003.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00009_S006.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00009_S010.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00010.4.eps,width=0.65in,clip=t}&
\psfig{figure=00010_S001.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00010_S003.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00010_S004.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00010_S005.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00010_S006.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00012.4.eps,width=0.65in,clip=t}&
\psfig{figure=00012_S001.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00012_S002.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00012_S007.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00012_S009.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00012_S012.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00001.5.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S004.5.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S008.5.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S009.5.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S011.5.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S012.5.tif.eps,width=0.65in,clip=t}\\
\end{tabular}}
\mycaptionS{Estimated body poses from real sequences obtained via
MS inference.}\label{fig:RealBodyMS}
\end{figure*}
}

\section{Conclusions}

\label{sec:Dis}

\changed{ In this paper, we have described a novel method that allows
us to infer 3D and 2D articulated body pose from observed
visual features in a single image, a problem usually regarded as
ill-posed. This was done by combining generative and discriminative
models to solve the complex probabilistic inference problem. This
approach is most useful when the generative model is accurate (\eg we
have an inverse mapping function) but it is difficult to perform
inference using this model alone.}

\changed{
In order to solve the inference problem (and also perform MAP
estimation), we have shown that a mathematically sound approach is to
use a discriminative model and learn its parameters using relevant
training data. The probability distribution implied by the
discriminative model can be used as a proposal distribution to
generate samples and find a posterior probability distribution
(perform approximate inference) under the (accurate but complex)
generative model.}


%q=p, then done???? OIK

%% In this paper, we have described a novel method that allow us to
%% combine generative and discriminative models for proabbilistic
%% inference. The SMA employs a set of several mapping functions that are
%% learned from training data. Each specialized function maps certain
%% domains of the input space onto the output space. The SMA learning
%% formulation uses ideas from Maximum Likelihood estimation and latent
%% variable models. A variant of the Expectation-Maximization algorithm
%% is used for simultaneous learning of the specialized domains along
%% with the mapping functions. One key advantage of the SMA is that it
%% can model ambiguous, one-to-many mappings that may yield multiple
%% valid output hypotheses.
%% %Once learned, the mapping
%% %functions generate a set of output hypotheses for a given input
%% %via a statistical inference procedure.

%% Another key advantage of the SMA formulation is its incorporation
%% of a feedback or inverse function, $\zeta$ in statistical
%% inference. 
%% %if
%% %desired.}
%% %To the best
%% %of our knowledge, we do not know of any other probabilistic
%% %formulation undertaking these ideas.
\changed{ When comparing it to other relevant methods, we can find
alternative (dual) interpretations of this framework. The use of a
generative model (through $\zeta$) affords an alternative to complex
discriminative models; for example, it is an alternative to the gating
networks of the Mixture of Experts paradigm \cite{Jordan94}. In
general, instead of learning increasingly complex discriminative
models such as \cite{Hinton98,Friedman91}, we can exploit an accurate
generative model and learn a simpler discriminative model.}

\out{The discriminative
model in our approach assumes that the mixing factors are independent
of the input, as seen in Sec.\ \ref{sec:ProMod}. At first sight, this
seems to limit the architecture's expressiveness. However, the
combination of discriminative (also referred here as 'forward') and
generative models eliminates this independence assumption. In other
words, the generative model $\zeta$ provides an alternative that
avoids increasing the discriminative model complexity without
restricting model expressiveness.}

%%  Note that
%% in our formulation formulation, different sets of appropriate
%% conditional independence assumptions are specified by the forward and
%% inverse models. 
%% In
%% applications such as those presented in this paper, $\zeta$ can be a
%% computer graphics rendering function or an approximation $\hat{\zeta}$
%% can itself be learned from training data.  Thus, the SMA exploits
%% available prior information about the structure of the problem.


%%RRChange This allowed us... [very important]
%%SS: OK.  I reworded slightly to make it clearer/shorter.
%%RR: If you remove 'if desired' it would be OK, since SMA needs them both
%%SS: Hope it's OK with you.
%%RR:
%% I would prefer this, hope it is clear what's the point from my email
%% 'Another key advantage of the SMA formulation is its incorporation of a
%% feedback or inverse function, $\zeta$ in statistical inference. This
%% allowed us to derive an inference method was based on the possibility
%% of alternatively use different sets of conditional independence
%% assumptions specified by the forward and inverse models'
%%RRChange To the best of our knowledge, we do not know of any other probabilistic formulation undertaking these ideas. [I think we should emphasize the novelty here]
%%SS: I removed this.  added word ``novel'' in prior sentence.
%%RR: Do you think it is too risky to say that? or why did you remove it?
%%RRChange [deleted] ....learned from training data \footnote{It is important to add that the use of $\zeta$ does not limit the possibility of having multi-modal posteriors over $\mb{x}$.}
%%SS: OK

Our approach was demonstrated in a computer vision system that can
estimate the articulated pose parameters of a human body or human
hands, given features computed from a single image.  This is a
particularly difficult problem because this mapping is highly
ambiguous and complex, and it is infeasible to perform inference using the
discriminative model.  We have obtained promising results even using a
very simple set of image features, such as moment invariants of the
body silhouette.  Choosing the best subset of image features for this
application is by itself a complex problem, and a topic of ongoing
research.

This approach offers several advantages over many previous methods for
articulated pose estimation. These have tried in numerous ways to use
camera geometry and/or model registration to perform pose estimation,
resulting in iterative procedures that require careful choice of
initial conditions (model placement). We have shown how in some cases
these alternative approaches could be seen as inferring a posterior
distribution using the generative model only. In this approach no
iterative minimization methods are used in pose inference. Moreover,
inference is fully automatic -- no manual initialization of the
articulated model is required.  Another set of previous approaches
attempt to learn articulated model dynamics
\cite{Brand99,Howe99,Perona00}; however, learning dynamics requires
substantially more training data, and tends to produce systems that
are biased towards specific motions. Our framework avoids this and
infers pose from a single image only.

Applications need not be limited to the vision domain. As a simple
example, one could apply this approach in speech recognition problems,
where the input space is given by features computed on acoustic
signals (\eg cepstral coefficients), and the output space could be the
space of phonemes. In this case, the generative model (feedback
function) would involve an acoustical rendering of phonemes.

Several interesting problems remain for future work.  Within the
context of articulated pose estimation, examples include (1) adapting
the system to a specific body morphology, one of the major issues
affecting performance, and (2) integrating pose estimation with
image segmentation for potentially greater robustness to occlusion and
noise. Methods for incorporating knowledge of dynamics in the same
framework should be investigated, as discussed in
\cite{RosalesPhDThesis}. Another general problem is how to learn what
the best (\eg visual) features are for specific problems or
datasets. While promising advances have been made, extension of our
framework to incorporate such concepts remains a topic for future
investigation.
%%% Adaptive extra learning in the q model

\section*{Acknowledgments} The hand sequences used in our
experiments were collected in collaboration with Vassilis Athitsos.
We thank Tommi Jaakkola, Quaid Morris, and Matt Brand for suggestions
and interesting discussions. This research was supported in part by
the U.S.\ Office of Naval Research under grants N000140310108 and
N000140110444, and the U.S.\ National Science Foundation under grants
IIS-0208876 and IIS-9809340.

\renewcommand{\baselinestretch}{1}
\bibliography{thesis}

\end{document}
