\documentclass[11pt]{article}
\bibliographystyle{plain}
\usepackage{amssymb}
\usepackage{times}
%\usepackage{doublespace}
\thispagestyle{empty}
\newcommand{\mse}{mean-square error }
%TPAMI-0029-0403
\newcommand{\hide}[1]{}

\newcommand{\ui}{^{(i)}}
\newcommand{\us}{^{(s)}}
\newcommand{\beq}{\begin{equation}}
\newcommand{\eeq}{\end{equation}}
\newcommand{\beqa}{\begin{eqnarray}}
\newcommand{\eeqa}{\end{eqnarray}}

\newcommand{\ie} {{\it i.e., }}
\newcommand{\eg} {{\it e.g., }}

\newcommand{\cl}[1]{{{\cal{#1}}}}

\newcommand{\mr}[1]{{\mathrm{#1}}}
\newcommand{\mb}[1]{{\mathbf{#1}}}


\newcommand{\mycaption}[3]{\renewcommand{\baselinestretch}{1}\caption[#1]{#2.}{#3}\renewcommand{\baselinestretch}{1.5}}

\newcommand{\mycaptionS}[1]{\renewcommand{\baselinestretch}{1}\caption[#1]{\small #1}\renewcommand{\baselinestretch}{1.5}}

\newcommand{\CapMViewc}{{Estimating same hand pose at $26$ viewpoints. The feedback function used was estimated from data. The figure has two sets of columns. Each column has the ground truth, MO, and best three MS samples. The viewpoint $(\beta_1,\beta_2)$ is indicated on the right side of each column}}


\newcommand{\CapMViewb}{{Example estimated hand poses at random view points obtained using the MS algorithm. Feedback function was estimated from data. Columns 1-2 show the ground truth and the estimate using the MO algorithm, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}

\newcommand{\CapMViewcE}{{Estimating same hand pose at $26$ viewpoints. The feedback function used was the computer graphics rendering. The figure has two sets of columns. Each column has the ground truth, MO, and best three MS samples. The viewpoint $(\beta_1,\beta_2)$ is indicated on the right side of each column}}

\newcommand{\CapMViewbE}{{Example estimated hand poses at random view points obtained using the MS approach. Feedback function was computer graphics rendering. Columns 1-2 show the ground truth and the estimate using the MO algorithm, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}

%-----

\newcommand{\CapTestI}{{40 examples of estimated hand poses chosen uniformly at random. Reconstruction found using the Mean Output (MO) approach. The feedback function used was estimated from data. Each example consists of a pair of images: ground-truth (top), and estimate obtained using the mean output algorithm (bottom)}}

\newcommand{\CapTestIE}{{40 examples of estimated hand poses chosen uniformly at random. Reconstruction found using the Mean Output (MO) approach. The feedback function was computed using computer graphics rendering. Each example consists of a pair of images: ground-truth (top), and estimate obtained using the mean output algorithm (bottom). For comparison, the frames  are the same as those used when feedback was estimated from data}}

\newcommand{\CapRTestI}{{40 examples of estimated hand poses captured every 0.9 secs from real video (RV). Reconstruction found using the Mean Output (MO) approach. The feedback function used was estimated from data}} %Each example consists of a pair of images: ground-truth (top), and estimate obtained using the mean output algorithm (bottom)}}

\newcommand{\CapRTestIE}{{40 examples of estimated hand poses captured every 0.9 secs from real video (RV). Reconstruction found using the Mean Output (MO) approach. The feedback function was computed using computer graphics rendering}} %Each example consists of a pair of images: input video frame (top), and estimate obtained using the mean output algorithm (bottom).  Note: for comparison frames are same as those used when feedback was estimated from data}}

\newcommand{\CapTestII}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach. The feedback function  was estimated from data. Columns 1-2 show the ground truth and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}


\newcommand{\CapTestIIE}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach. The feedback function was computed using computer graphics rendering. Columns 1-2 show the ground truth and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}

\newcommand{\CapRTestII}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach using real video (RV). The feedback function was estimated from data}} %%. Frames were chosen every 0.9 secs. Columns 1-2 show the input video frame and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}

\newcommand{\CapRTestIIE}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach and real video (RV). The feedback function was computed using computer graphics rendering}} %%. Frames were chosen every 0.9 secs. Columns 1-2 show the input video frame and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}


\newcommand{\CapTestIIWE}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach. The feedback function was computed using computer graphics rendering. Column 1 shows ground truth, columns 2-6 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}



%%%% ----- Multiple

\newcommand{\CapMTestI}{{40 examples of estimated hand poses chosen uniformly at random and reconstruction found using Mean Output (MO) approach. The feedback function used was estimated from data. Each example consists of a pair of images: ground-truth (top), and estimate obtained using the mean output algorithm (bottom)}}

\newcommand{\CapMTestIE}{{40 examples of estimated hand poses chosen uniformly at random and reconstruction found using Mean Output (MO) approach. The feedback function was computed using computer graphics rendering. Each example consists of a pair of images: ground-truth (top), and estimate obtained using the mean output algorithm (bottom). Note: for comparison frames are same as those used when feedback was estimated from data}}


\newcommand{\CapMTestII}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach. Views and poses were chosen uniformly at random. The feedback function was estimated from data. Columns 1-2 show the ground truth and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}


\newcommand{\CapMTestIIE}{{Example estimated hand poses obtained using the Multiple Sample (MS) approach. Views and poses were chosen uniformly at random. The feedback function was computed using computer graphics rendering.  Columns 1-2 show the ground truth and the MO solution, columns 3-7 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}



\newcommand{\CapRTestBodyII}{{Example estimated body poses obtained using the Multiple Sample (MS) approach using real video (RV). The feedback function was estimated from data. Frames were chosen every $\frac{2}{3}$ secs. Column 1 shows the input video frame, columns 2-6 show sample 1-4 and 12 obtained via the MS approach (S1 is the most probable sample)}}


\usepackage{psfig}
%\pssilent
%\renewcommand{\psfig}[1]{}

\renewcommand{\dbltopfraction}{1.0}
\renewcommand{\textfraction}{0.0}
\renewcommand{\topfraction}{1.0}
\renewcommand{\bottomfraction}{1.0}
\renewcommand{\baselinestretch}{1.5}
\def\changed#1{{\bf{#1}}}
\psdraft
\newcommand{\comment}[1]{{\large\it #1}}
\setlength{\textwidth}{6.5in} \setlength{\textheight}{9.0in}
\setlength{\oddsidemargin}{0.0in} %\setlength{\topmargin}{-0.5in}

\begin{document}
\thispagestyle{empty}
\title{A generative/discriminative framework for estimating articulated body pose from a single image}
\title{\vspace*{-1in}{\small Boston University Computer Science
Technical Report 2003-007, \today.  Submitted to IEEE PAMI.}\\
~\\
~\\
~\\
A generative/discriminative framework for estimating articulated body pose from a single image}

\author{\begin{tabular}{cc}
R\'{o}mer Rosales  & Stan Sclaroff\\
Probabilistic and Statistical Inference Group& Image and Video
Computing Group \\
Dept.\ of Electrical and Computer Engineering & Dept.\ of Computer
Science
\\
 University of Toronto & Boston University \\
 Toronto, ON M5S 3G4 CANADA & Boston, MA 02215 USA\\
romer@psi.toronto.edu & sclaroff@cs.bu.edu\\
\end{tabular}}

\date{~}
%\date{Version of \today}
\renewcommand{\baselinestretch}{1.}
\maketitle \thispagestyle{empty}
\renewcommand{\baselinestretch}{1.5}

%% \begin{abstract}
%% A probabilistic, nonlinear supervised learning model is proposed:
%% the Specialized Mappings Architecture (SMA).  The SMA employs a
%% set of several mapping functions that are estimated automatically
%% from training data. Each specialized function maps certain domains
%% of the input space (e.g., image features) onto the output space
%% (e.g., articulated body parameters). One important advantage of
%% the SMA is that it can model ambiguous, one-to-many mappings that
%% may yield multiple valid output hypotheses. Once learned, the
%% mapping functions generate a set of output hypotheses for a given
%% input via a statistical inference procedure. The SMA inference
%% procedure incorporates an inverse mapping or feedback function,
%% which enables the SMA to evaluate the likelihood of each
%% hypothesis. Possible feedback functions include computer graphics
%% rendering routines that can generate images for given hypotheses.
%% The SMA employs a variant of the Expectation-Maximization
%% algorithm for simultaneous learning of the specialized domains
%% along with the mapping functions, and approximate strategies for
%% inference. The framework is demonstrated in a computer vision
%% system that can estimate the articulated pose parameters of a
%% human body or human hands, given image silhouettes. The accuracy
%% and stability of the SMA are also tested using synthetic images of
%% human bodies and hands, where ground truth is known.

%% \end{abstract}

%which allowed us to derive inference
%methods based on the possibility of alternatively use different sets
%of conditional independence assumptions specified by the forward and
%inverse models. The inverse function

\begin{abstract}

A probabilistic, nonlinear supervised learning model is proposed: the
Specialized Mappings Architecture (SMA).  The SMA employs a set of
several forward mapping functions that are estimated automatically
from training data. Each specialized function maps certain domains of
the input space (e.g., image features) onto the output space (e.g.,
articulated body parameters). The SMA can model ambiguous, one-to-many
mappings that may yield multiple valid output hypotheses. Once
learned, the mapping functions generate a set of output hypotheses for
a given input via a statistical inference procedure. The SMA inference
procedure incorporates an inverse mapping or feedback function in
evaluating the likelihood of each of the hypotheses. Possible feedback
functions include computer graphics rendering routines that can
generate images for given hypotheses.  The SMA employs a variant of
the Expectation-Maximization algorithm for simultaneous learning of
the specialized domains along with the mapping functions, and
approximate strategies for inference. The framework is demonstrated in
a computer vision system that can estimate the articulated pose
parameters of a human's body or hands, given silhouettes from a single
image. The accuracy and stability of the SMA are also tested using
synthetic images of human bodies and hands, where ground truth is
known.
%\hide{In the SMA
%formulation it is possible to use different sets of conditional
%independence assumptions in the forward and inverse models if
%desired.}
%In both
%tests, excellent performance is attained.
%SSChanged: Commented out last sentence...
%% It's just begging for abuse from the reviewers.
%% Let the reader be the judge please.
%% RR:OK
%% Reworded the abstract a little.  The abstract already says what's important.
%% One need not say things like "An important aspect of the approach...."  etc.
%% RR: After the clarification at the beginning of my email, you'll see that this is not correct:
% 'In the SMA formulation
%it is possible to use different sets of conditional independence
%assumptions in the forward and inverse models if desired.'
% That's why I prefer:
%'It incorporates an inverse
%mapping or feedback function, which allowed us to derive inference
%methods based on the possibility of alternatively use different sets
%of conditional independence assumptions specified by the forward and
%inverse models. The inverse function enables the SMA to evaluate the
%likelihood of each of the hypothesis.'
%% The knowledge of the inverse function allowed us to use both sets of CIA's (at the same time for interence).
%% Otherwise SMA would have been like most ML methods
\end{abstract}

\paragraph{Keywords:} Supervised learning, statistical inference,
mixture models, Expectation Maximization algorithm, articulated
structure estimation, human body pose, hand shape.

\newpage




%%%%%%%%%%%\renewcommand{\psfig}[1]{}
\section{Introduction}

A fundamental task for vision systems is to infer the state of the
world given some form of visual observations. From a computational
perspective, this often involves facing an ill-posed problem; for
example, relevant information may be lost via projection of the
three-dimensional world into a two-dimensional image. As a result, it
is often the case that multiple valid interpretations of an image are
possible.  Solving an ill-posed problem requires some form of
additional information, usually provided as a model of the underlying
process. Interestingly, in their day to day life, humans are
surprisingly adept at interpreting the visual world, despite the
ill-posed nature of the problem.

One essential vision problem is that of inferring or estimating the
underlying 3D attributes of a real world object, based on its 2D
projection onto a camera. In this paper we will focus on non-rigid
articulated objects, in particular on human body pose and also hand
configuration. 

Humans can easily estimate the articulated pose and motion of people
in a scene, given only relatively low-resolution, monocular images of
the world, e.g., from a photograph or a video.  It is believed that
humans employ extensive prior knowledge about human body structure and
motion in this task \cite{Johansson73}.  Assuming this, in this paper
we will consider how a computer might learn the underlying {\it
knowledge} in the form of a probabilistic model, and thereby infer
pose from a single image.

%% Don't know what this figure refers to
%% \begin{figure}[p]

%% \vspace{1.5in} \mycaptionS{\small Example ill-posed inference
%% task. Given only a person's silhouette (a), approximately infer:
%% (b) the projected 2D locations of the person's joints in the
%% image, or (c) the 3D locations of the person's joints in Euclidean
%% space.\label{fig:exampleTask}}
%% \end{figure}


Let us consider an example body pose inference task: given only a
person's silhouette, estimate that person's articulated body pose.  To
be concrete, let us define articulated pose in terms of: (a) the 2D
locations of the person's joints in the image, or (b) the 3D locations
of the person's joints in Euclidean space. Imagine drawing marks on
the silhouette image that approximately label the joints: left elbow,
right elbow, left knee, right knee, and so on. Also consider a
plausible 3D pose interpretation for this silhouette.  While this
inference task seems relatively simple for a human to perform, the
task is quite challenging, using either representation (a) or (b), for
current computer vision systems.
%RRChange ...  using either representat
%SS: OK
% An example image is shown in Fig.\ \ref{fig:exampleTask}.
\changed{For purposes of computation, the above task can be defined as
follows: given an observation vector $\mb{x}\in \Re^c$ that was
extracted from an image of a person, infer the parameterized
articulated pose as a vector $\mb{h} \in \Re^t$}. Assume these input
and output \changed{vector} spaces ${\Re}^c$ and ${\Re}^t$ are
continuous. \changed{In a very generic machine learning framework,
inference might be regarded as a function $\varphi:{\Re}^c
\rightarrow{\Re}^t$ that for a given input (or observation) computes
as output a single pose (\eg the most likely pose according to some
measure) or more generally a pose posterior probability distribution
(the latter would lead to a different definition of $\varphi$). While
the apparent simplicity of this concept is alluring, it leaves a
number of nettlesome open issues: how to select the appropriate type
or form for this function (\eg we may have reasons to use a
discriminative instead of a generative model\footnote{The term {\it
inference} is used mainly in the context of generative models;
however, in this section we consider a broader usage by employing it
in the context of discriminative models also.}), how to take advantage
of the problem structure (\eg prior knowledge for modeling), how to
estimate (learn) this mapping from data, and how to perform inference
efficiently or approximately if exact inference is intractable (\eg
how to make use of what was learned from data). These are fundamental
problems in data modeling in general.}
%%RRChange ... a single pose ...
%%SS: OK

\psfigurepath{./figs}
\begin{figure}[t]
\centerline{
\psfig{figure=Intro3w.ps,width=0.6\textwidth,clip=t}
}
\mycaptionS{\small Example ambiguity in mapping
body silhouette cues in ${\Re}^c$ to articulated body poses in
${\Re}^t$. Given silhouette $\mb{x}$, poses $\mb{a}$--$\mb{h}$
are all valid hypotheses.  In general,
entire regions in ${\Re}^t$ may contain valid poses.
\label{fig:mappingAmbiguity}}
\end{figure}

If we try to learn a mapping directly, let us say by estimating the
parameters of a parameterized function $\phi:{\Re}^c
\rightarrow{\Re}^t$ as in a discriminative (bottom-up) approach, we
encounter several problems. The form required for $\phi$ may not be
simple, because the mapping from observations (\eg an image) to
articulated poses is generally ambiguous (one-to-many). In fact no
single function can perform this mapping. An example is illustrated in
Fig.~\ref{fig:mappingAmbiguity}: the arm locations cannot be uniquely
inferred given the silhouette $\mb{x}$; therefore, $\mb{a}$--$\mb{h}$
are all possible pose configurations (the arms can move in such a way
that the silhouette does not change). Note also that pose $\mb{c}$ is
the reflection of $\mb{a}$: the camera looks at the back rather than
at the front of the body. There might be an infinite number of valid
poses for a particular input. Moreover, regions of valid poses need
not be connected in $\Re^t$. For instance, different regions in
$\Re^t$ may correspond to ranges of valid poses, \eg some viewed from
the front and others from behind. Such ambiguities are not particular
to human body pose; for instance, analogous inference problems exist
in estimating hand pose from image features, as will be seen
later. Even though one may be tempted to just increase the complexity
of this bottom-up function $\phi$ and consider this choice as
necessary (due to the apparent intricacy of the problem at hand)%
\footnote{Moreover, unnecessarily increasing the complexity of $\phi$
can have other awful consequences such as overfitting.}, a fundamental
idea in this paper is that this choice may not be necessary, as will
be seen next.

%Complexity, and doesn't include knowledge
%%RRChange ...In fact
%%SS: OK

Let us now consider the inverse problem: given an articulated pose
vector $\mb{a}$, generate its silhouette $\mb{x}$. With a good
computer graphics model of the human body, one can easily render the
silhouette $\mb{x}$. Thus, we can easily compute what we refer to as
the inverse mapping $\zeta:{\Re}^t\rightarrow{\Re}^c$ (note that
despite the simplicity of $\zeta$, its inverse may still be complex or
not even exist). Other real world problems share the property that
their inverse problem is simpler, e.g., speech recognition (after some
parameters are given, such as pitch). In fact, this property is a key
part of our problem definition and it will play an important role in
developing the framework presented in this paper. The argument is that
the inverse function $\zeta$ provides useful information about the
structure of the problem, \changed{ but cannot be incorporated
straightforwardly in a bottom-up discriminative approach or cannot be
used directly for inference. On the other hand, it might be useless in
a purely generative approach (these approaches are related to
tracking): we have a very accurate way to generate silhouettes from a
given pose configuration; however, this does not guarantee a simple
algorithm for pose inference.}

\changed{ In order to summarize, we now have a notion of the input and
output spaces, the forward and inverse relationships associated with
them, and a few difficulties that can arise in the context of our
example application. The mapping of inputs (cues) to outputs (poses)
is ambiguous (\ie one-to-many) and potentially very complex. The
former precludes the use of discriminative (bottom-up) supervised
learning methods that fit a single (or finite number of) function to
the data to produce a pose given the cue, e.g., most neural networks,
support vector machines, least squares estimation, boosting, etc. The
latter can easily create computational (space and time complexity for
learning) and modeling (overfitting) drawbacks. On the other hand, we
have access to the {\it inverse} map
$\zeta:{\Re}^t\rightarrow{\Re}^c$, which we can exploit in formulating
a solution to the inference/learning problem. However, having this
very accurate generative model alone (top-down) might not be very
useful in terms of finding an algorithm for estimating the body pose
given an input image.}

%% This access to the {\it inverse} map, as well as the
%% one-to-many forward ambiguity are two of the key characteristics
%% of our problem that make it different from other supervised
%% learning problems. The core algorithmic challenges are: 1.)
%% estimating the specialized domains and functions in an optimal way
%% that also takes into account the form of the specialized
%% functions, and 2.) using the knowledge of the inverse function to
%% formulate efficient inference and learning algorithms.

%%RRChange [added full paragraph]
%%SS: I removed this stuff before, because it's redundant.
%% I remove it again.
%% The first sentence is simply a restatement of the paragraph
%% before it.  And the last sentence of paragraph before that.
%% It simply won't fit.
%RR: Can we somehow state that 'these two are the main characteristics of the problem we are trying to solve which make it different from other supervised learning problems'. It will emphasize that we are doing something different. I think it is important.
%RR:
%This paragraph also states clearly what are the fundamental problems (why it is difficult).
%I think this paragraph is a great summary of the whole machine learning part of the paper. It should be kept somehow.
%% SS: OK, OK.  It's your thesis after all :)
%% But as I point out in my email this paragraph is mostly redundant.

\changed{In this paper, we describe a probabilistic, nonlinear
framework for combining generative and discriminative models for
articulated pose estimation. This approach is general, and thus can be
used in other problems with similar structure. The framework employs a
set of $M$ functions $\phi_k:{\Re}^c \rightarrow{\Re}^t$, each
associated with a mixture component in a mixture distribution. Each
function maps certain sub-domains of the input space (cues) onto the
output space (poses). The sub-domains need not be connected regions in
the input or output spaces.}
%More importantly we will see that we do
%not need to explicitly supply a model for these sub-domains.
\changed{
These (bottom-up) functions are estimated automatically from training
data, via a supervised learning procedure. A variant of the
Expectation-Maximization algorithm is used for simultaneous learning
of the specialized domains along with the parameters of these
functions. The learned conditional distribution is then used as an
approximation to the true (generative) distribution given by the
inverse function $\zeta$. This approximation is employed in the same
way a proposal distribution is used to approximate sampling from a
more complex distribution.}

\changed{
The basic concepts are illustrated in Fig.~\ref{fig:SMAexample}. For
a given input $\mb{x}$, the bottom-up model generates a set of output
hypotheses. We then exploit the generative model (defined by the
inverse mapping $\zeta$) to evaluate the probability of each
hypothesis.}
%RRChange  .... More importantly w
%%SS: is it really more important than other stuff already in this paragraph?
%% Also, I don't know what you mean at all. It's machine learning after all.
%% you have to pick a functional form.  If you feel it's important, can you
%% try to explain it in email?  It's not really clear what you mean here
%% at all.  I would prefer you leave it out.
%%RR:OK

%\psfigurepath{../ICCV01/iccv01/figs}
\begin{figure}[t]
\centerline{
%(a) \psfig{figure=Learning2.ps,width=0.48\textwidth,clip=t}
%~(b) \psfig{figure=Inference2.ps,width=0.46\textwidth,clip=t}
(a) \psfig{figure=map.GIF.eps,width=0.4325\textwidth,clip=t}
~~~(b) \psfig{figure=fb.GIF.eps,width=0.414\textwidth,clip=t}
}
%\begin{figure}[t]
%\vspace*{2.5in}
\mycaptionS{\small
Inferring body pose (illustration): (a) Given an input vector $\mb{x}$,
we generate a set of hypotheses. (b) The inverse mapping function
$\zeta$ is employed in evaluating each hypothesis.
\label{fig:SMAexample}}
\end{figure}
%%SS: adjusted the size (smaller)

%%% RomerV5: took out following paragraph
\hide{An important advantage of this approach is that it can model
ambiguous, one-to-many mappings that may yield multiple valid output
hypotheses. Unlike other learning approaches that employ a set of
mapping functions (\eg \cite{Friedman91,Hinton98,Jordan94}), this
approach incorporates an inverse mapping $\zeta$ in probabilistic
inference. The framework is evaluated in a computer vision system that
can estimate the articulated pose parameters of a human body or human
hands, given real image silhouettes.  Accuracy and stability are also
tested using synthetic images of human bodies and hands, where ground
truth is known.}


%% %% For related work
%% Several other learning models use a similar concept of fitting
%% surfaces to the observed data by splitting the input space into
%% several regions and approximating simpler functions in these regions
%% (\eg \cite{Jordan94,Hinton98,Friedman91}). However, in these
%% approaches, the inverse map is not incorporated in the estimation
%% algorithm because it is not considered in the problem definition and
%% it is necessary to make the forward model more complex.

%% 1111111111111

\section{Related Work}

In computer vision, recovery of articulated body pose from images is
often formulated as a {\it tracking} problem. Usually, link-joint
models comprised of 2D or 3D geometric primitives are designed
beforehand to roughly match the specific morphology of the target in
question
\cite{Bregler98,Deutscher00,Gavrila95,OrmSidBlaHas01,Rehg95,shimada,Sminchisescu01}.
Mesh models have also been used as an alternative to link-joint models
\cite{heap}. At each frame, these models are fitted to the image to
minimize some cost function that favors the overlap of the model and
associated image regions (or motion). The fitting or cost function is
sometimes implicitly defined using a generative model. Despite their
descriptive power, this family of approaches has a number of critical
drawbacks. Generally, a non-linear optimization problem must be solved
at every frame (sometimes equivalent to inference in a complex
generative model). Careful manual placement of the model on the first
frame in a video sequence is also required.  Moreover, tracking in
subsequent frames tends to be sensitive to errors in initialization
and numerical drift; as a result, these systems cannot recover from
tracking errors in the middle of a sequence.

To address these weaknesses, specialized dynamical models have
been proposed \cite{Isard98J,OrmSidBlaHas01,PavRehMac01}.  These
methods learn a prior distribution over some specific motion
class, such as walking. This prior is used to predict and
hopefully improve the pose estimates in future frames. However,
this strong prior substantially limits the generality of the
motions that can be tracked; a prior for a given class of motions
is generally useless when used for tracking objects undergoing a
different class of motion, e.g., walking vs.\ dancing.

Other methods for constrained tracking include
\cite{Black98,Black95}, where a subspace of allowable motions is
learned from a set of examples. These examples and the model
(usually linear) are hoped to be sufficient to span the set of
possible motions to be seen during tracking. Thus, pose inference
involves finding a linear projection of the observed data onto the
motion subspace. This subspace approach enforces a strong prior;
as mentioned previously, this limits the generalization of the
model to classes of motions not seen in the training set.
Furthermore, articulated motion is generally non-linear, and
cannot be easily explained as a linear projection.




In our approach we avoid matching image features (e.g., image
regions, points, or articulated models) from frame to frame.
Therefore, we do not refer to our approach as {\it tracking}, per
se. This is in direct contrast with the techniques mentioned
above.  A number of other approaches also depart from the
aforementioned tracking paradigm. We summarize these next.

In \cite{Howe99} a statistical approach is employed in
reconstructing the 3D motions of a human figure. The approach
employs a Gaussian probability model for short human motion
sequences. It is assumed that 2D tracking of the joint positions
in the image is given; therefore, this assumption implicitly
incurs the restrictions found in all tracking approaches.

In \cite{Perona00} dynamic programming is used to calculate the best
global matching of image points to predefined body joints, given a
learned probability density function of the position and velocity of
body features. Although not explicitly mentioned by the authors, the
probability function is defined by a triangulated acyclic graph. Thus,
inference is feasible due to the running intersection property
\cite{Jordan99,Pearl88}.  Still, in this approach, the image points
and model initialization must be provided by hand or through some
other method.

In \cite{Brand99}, the manifold of human body dynamics is modelled
via a hidden Markov model with an entropic prior. Once the states
are inferred from observations, a quadratic cost function is used
to generate a continuous path in configuration space, \ie body
pose space.

In all of the non-tracking approaches mentioned
\cite{Brand99,Howe99,Perona00} models of {\em motion} were
estimated from data. Although the approach presented in this paper
can be used to model dynamics, we argue that when general human
motion dynamics are to be learned, the amount of training data,
model complexity, and computational resources required are
impractical. As a consequence, models with unacceptably large
priors towards specific motions are generated. Although by not
modelling the dynamics we may be ignoring information that could
be used to further constrain the inference process, there are some
benefits. For instance, a model for inferring body pose that does
not consider dynamics provides invariance with respect to speed
(\ie sampling differences) and direction in which motions are
performed. This happens simply because this model treats
configurations as temporally independent of each other. Other
approaches that use a single image include
\cite{Kakadiaris00,Haritaoglu98a,Lee85,Orourke80,Taylor00};
however, most of these methods also require that projected joint
locations be given as input. In our approach this is not
necessary.


Our approach maps visual features to likely body configurations.
Following a machine learning paradigm, stochastic functions that map
visual features to pose parameters are approximated from training
data. A unique aspect of our approach is the combined use of (1) these
mapping functions (defining a discriminative model) with (2) the
inverse mapping function $\zeta$ (defining a generative model). After
multiple poses have been inferred from just the visual cues, $\zeta$
transforms these pose configurations back to the visual cue
(observation) space. In this space, we can then automatically choose
among a set of reconstruction hypotheses. This is a fully
probabilistic inference process. Our approach avoids the need for
manual initialization or tracking; it thereby avoids the consequent
disadvantages of tracking. Remarkably, relatively few computations are
required for inference. We will now formalize and explain our approach
in detail.
%RRChange .This is a ...
%% SS: OK


\renewcommand\arraystretch{0.8} %% SS: This changes separation between table rows

\begin{table}[t] {\small
\begin{tabular}{|ll|}
\hline
number of training examples & $N$\\
training set & $\cl{Z}=\{\mb{z}_1,...,\mb{z}_N\}$ \\
training example (input,output) pair & $\mb{z}_i = (\upsilon_i,\psi_i)$ \\
input (feature) training vector & $\upsilon_i \in \Re^c$ \\
output (pose) training vector & $\psi_i \in \Re^t$\\
\hline
generative and discriminative models probability distributions & $p$,$q$ (respectively)\\
feedback (rendering) function (for generative model)&$\zeta:{\Re}^t\rightarrow{\Re}^c$\\
\hline
number of samples during inference& $S$\\
observation or input image feature & $\mb{x}^*$\\
output (pose) hypothesis ( a sample from $q(\mb{h}|\mb{x}^*)$)& $\mb{h}_k$\\
estimate of most likely output hypothesis & $\hat{\mb{h}}$\\
\hline
Mapping functions (one for each mixture distribution component)& $\Phi = \{\phi_1,\dots,\phi_M\}$\\
discrete set of labels for mixture components&${\cal C}=\{1,\dots,M\}$\\
hidden random variables assigning mixture component to training samples & $\mb{y}=(y_1,\dots,y_N), y_i\in{\cal C}$ \\
prior probability that mixture component $k$ will be used & $\lambda_k = Q(y=k)$\\
mapping function parameter vector & $\theta_k$\\
discriminative model parameters (to be learned) & $\theta=(\theta_1,\dots,\theta_M,\lambda)$\\
posterior probability of $k$-th mixture component for $\mb{z}_i$ during EM& $\tilde{Q}(y_i=k)=Q(y_i=k|\psi_i,\upsilon_i,\theta)$ \\
\hline
\end{tabular}}
\mycaptionS{Some mathematical symbols used in this paper.} \label{tab:symbols}
\end{table}



\section{Probabilistic Models}
\label{sec:ProMod}

\changed{We will now formally define both the discriminative and
generative models to be employed. The discriminative model will be
estimated from data and the generative model will be defined by the
inverse function $\zeta$. Intuitively, they represent two views of
the same problem and will be used together in this framework to
provide a solution to inferring body pose from a single image.}

\subsection{The Discriminative Model (Bottom-Up)}
In our approach, the discriminative model is represented by a set
of mapping functions. These functions are estimated from training data,
via a supervised learning procedure. 

Let $\cl{Z}=\{\mb{z}_1,...,\mb{z}_N\}$ be an observed training set of
input-output pairs $\mb{z}_i = (\upsilon_i,\psi_i)$.  Each $\upsilon_i
\in \Re^c$ is an input (feature) vector, and each $\psi_i \in \Re^t$
is its corresponding output (pose) vector. A summary of mathematical
symbols used in this formulation is provided in Table
\ref{tab:symbols}.

We will approach our forward problem as one of hidden variable density
estimation. We begin by introducing the unobserved random variable
$\mb{y}=(y_1,\dots,y_N)$. In our model any $y_i$ has as its domain the
discrete set $\cl{C}=\{1,\dots,M\}$ of labels for the specialized
mapping functions, and can be thought of as the function number used
to map the $i$-th training pair, $\mb{z}_i$.  Thus $M$ is the number
of specialized mapping functions. Our model uses parameters
$\theta=(\theta_1,\dots,\theta_M,\lambda)$, where $\theta_k$
represents the parameters of the $k$-th mapping function, and
$\lambda=(\lambda_1,\dots,\lambda_M)$, where $\lambda_k$ represents
$Q(y=k)$, the prior probability that the mapping function with label
$k$ will be used to map an input-output pair.

Taking a maximum-likelihood viewpoint, we are interested in
finding the optimal parameter settings for our model; thus, we
seek to maximize the joint log-probability:
%\footnote{This is almost
%identical to taking a MAP estimate viewpoint and considering the
%parameters $\theta$ as random variables with uniform prior in some
%(bounded) interval}
\begin{equation}
\theta^* = \arg\max_\theta \log q(\cl{Z}|\theta).
\end{equation}
Assuming independence of observations given $\theta$, and using
Bayes' rule we obtain:
\begin{eqnarray}
\theta^*&=&\arg\max_\theta \sum_i \log q(\mb{z}_i|\theta)\\
&=&\arg\max_\theta \sum_i \log \sum_k q(\mb{z}_i|y_i=k,\theta)
Q(y_i=k|\theta)\\ \label{eq:OptEq} &=& \arg\max_\theta \sum_i \log
\sum_k q(\psi_i|\upsilon_i,y_i=k,\theta)Q(y_i=k|\theta)
q(\upsilon_i), \label{eq:LogSum}
\end{eqnarray}
where we used the independence assumption
$q(\upsilon|\theta)=q(\upsilon)$. \changed{The term $q(\upsilon_i)$
describes how input patterns occur in the world and for solving
Eq.~\ref{eq:LogSum} it will be approximated by the empirical
distribution implied by our training data. As a consequence, patterns
that occur more often will have a larger effect in the maximization of
Eq.~\ref{eq:LogSum}.}

%Note that because the inputs,
%$\upsilon_i$ do not depend on the model parameters we can ignore
%their distribution when finding the optimal parameter settings.

Due to the sum of terms inside the logarithm of Eq.~\ref{eq:LogSum},
this optimization is generally intractable.  However, a variety of
practical approximate optimization methods exist, for example, methods
that are based on alternating minimizations \cite{Csiszar84}. An
Expectation Maximization (EM) \cite{Dempster77,Neal98} method is
described in Sec.\ \ref{sec:Lea}.



\subsubsection{Choice of a Likelihood Function}

Note that the above formulation is general. In particular, the
form of the probability $q(\psi_i|\upsilon_i,y_i=k,\theta)$ was
not specified.  A key question in instantiating the specialized
mapping architecture is: what form should be used for
$q(\psi|\upsilon,y,\theta)$?  This is the probability that output
$\psi$ was generated by the mapping function $y$, given the input
$\upsilon$ and model parameters $\theta$. In this work we analyze
the following possible cases:
\begin{enumerate}
\item A Gaussian joint distribution of input-output vectors:
\begin{equation}
q(\upsilon,\psi|y,\theta)=\cl{N}((\upsilon,\psi);\mu_{y},\Sigma_{y}),
\end{equation}
\item A Gaussian distribution, whose mean is the output of the
$y$-th mapping function:
\begin{equation}
 q(\psi|\upsilon,y,\theta)
 =\cl{N}(\psi;\phi_{y}(\upsilon,\theta),\Sigma_{y}).
\end{equation}
\end{enumerate}
One way to interpret (2) is that the error in estimating $\psi$,
given we know what mapping function to use, is Gaussian
distributed. %The distribution's mean is the output of the
%specialized function, and its covariance is dependent on the
%specialized function used.
These are the two forms tested in our experiments; however, this formulation is general, and can accept other forms for the
likelihood function.

\subsection{The Generative Model (Top-Down)}

\changed {Our approach involves the use of a generative model of images
(or image features). In the problem of human body pose estimation from
a single image this generative model can be defined in a simple
way. We will assume that an image or image features are generated by
sampling a pose from a prior distribution $p(\mb{h})$ and an image is
then generated using the rendering function $\zeta$ such that:
\beqa
\label{eq:zetaNormalDist}
%\label{eq:gmtd}
p(\mb{x}|\mb{h})={\cl N}(\mb{x};\zeta(\mb{h}),\Sigma_\zeta).
\eeqa
It is important to notice that despite the fact that the generative
model can be defined in a simple manner, the function $\zeta$ is
highly complex (non-linear); thus making probabilistic inference
intractable. In establishing a connection to previous methods, this
inference problem is usually referred to as {\it tracking}. Fitting an
articulated model (\eg composed of solid primitives) is equivalent to
a form of probabilistic inference with several important drawbacks:
this problem requires non-linear optimization of a very complex
function, and a good initial guess is difficult to determine
automatically (it is usually provided by manual articulated model
placement). This form of fitting also has other drawbacks already
explained.}

\changed{
Note that, without loss of generality, $\mb{x}$ could be used to
represent the image or image features also. In our case image features
can be obtained deterministically from the image.}

\section{Learning}
\label{sec:Lea}

\changed{ As explained above, an approximation method must be used in
learning the discriminative model parameters}.  We will employ an
Expectation Maximization (EM) approach. EM provides a general
framework for solving the maximum likelihood parameter estimation
problem in statistical models with hidden variables, like Eq.\
\ref{eq:LogSum}. Since the EM algorithm is well known
\cite{Dempster77,Amari95,Neal98}, we will only provide derivations
specific to our formulation.

Note that the unobserved random variables $y_i$ are assumed
independent, given $\mb{z}_i$. Thus, the E-step reduces to computing
the posterior probabilities for each $y_i$ given the model parameters
and observed data. We will denote this posterior
$Q(y_i=k|\psi_i,\upsilon_i,\theta)$ using the shortcut notation
$\tilde{P}^{(t)}(y_i=k)$. We then have:
\begin{equation}
\tilde{P}^{(t)}(y_i=k)=\lambda_{k}q
(\psi_i|\upsilon_i,y_i=k,\theta^{(t-1)})/\sum_{j \in \cl{C}} \lambda_j
q(\psi_i|\upsilon_i,y_i=j,\theta^{(t-1)}).
\end{equation}
Stated differently, this step estimates the responsibility of each
mapping function, $\phi_k$ for each data point,
$\mb{z}_i$. \changed{$\tilde{P}^{(t)}(y_i=k)$ represents the so-called
responsibility of function $k$ for data pair $i$. Also recall that
$\lambda_k=Q(y_i=k)$, the prior probability that function $k$ be
used.}

The M-step consists of finding $\theta^{(t)}=\arg\max_\theta
E_{\tilde{P}^{(t)}}[\log q(\cl{Z},\mb{y}|\theta)]$. In both of our
cases we can show that this is equivalent to finding:
\begin{equation}
\label{eq:MDef} \theta^{(t)}=\arg\max_{\theta} \sum_i \sum_{k \in
\cl{C}} \tilde{P}^{(t)}(y_i=k) [\log q(\mb{z}_i|y_i=k,\theta)+ \log
Q(y_i=k|\theta)].
\end{equation}

It is important to mention that this is valid if
$q(\mb{z}_i|\mb{y},\theta)$ depends on $y_i$ and not on $y_j$, for any
$j\neq i$.  Note that for the distributions discussed above, this
is true. We present solutions for the cases described above.

\subsection{Case (1)}

In this case we have:
\begin{equation} q(\upsilon,\psi|y,\theta)=
\cl{N}(\upsilon,\psi;\mu_{y},\Sigma_{y})= \cl{N}(\left[
\begin{array}{c}
\upsilon \\
\psi \\
\end{array}
\right];\left[
\begin{array}{c}
\mu_\upsilon \\
\mu_\psi \\
\end{array}
\right],
\left[
\begin{array}{cc}
\Sigma_{\upsilon\upsilon} & \Sigma_{\upsilon\psi}\\
\Sigma_{\upsilon\psi}^\top & \Sigma_{\psi\psi} \\
\end{array}
\right] )_{y}.
\end{equation}
In this case, we can show that the parameter learning problem
is reduced to a mixture of Gaussian estimation, for which it is
straightforward to estimate $\theta$ using EM. Moreover, the
Bayesian estimate of $\psi$ given an observed $\upsilon$ is also
Gaussian:
\begin{equation}
q(\psi|\upsilon,y,\theta)=\cl{N}(\psi;\mu_\psi+\Sigma_{\upsilon\psi}^\top\Sigma_{\upsilon\upsilon}^{-1}(\upsilon-\mu_\upsilon),\Sigma_{\psi\psi}-\Sigma_{\upsilon\psi}^\top\Sigma_{\upsilon\upsilon}^{-1}\Sigma_{\upsilon\psi})_{y}.
\end{equation}
Therefore in case (1), each specialized function $\phi_{k}$ is
just the mean of the conditional distribution
\begin{equation}
\phi_k(\upsilon,\theta)=(\mu_\psi+\Sigma_{\upsilon\psi}^\top\Sigma_{\upsilon\upsilon}^{-1}(\upsilon-\mu_\upsilon))_{y=k}.
\label{eq:phiGaussian}
\end{equation}
The confidence of the estimate is given by the covariance
%\begin{equation}
$\Sigma_k =
(\Sigma_{\psi\psi}-\Sigma_{\upsilon\psi}^\top\Sigma_{\upsilon\upsilon}^{-1}\Sigma_{\upsilon\psi})_{y=k}.$
%\label{eq:sigmaGaussian}
%\end{equation}
However, this expression
does not depend on the input, a sometimes undesirable consequence
of the given model. Thus, each function $\phi_k$ is linear in the
input vector from ${\Re}^c$.


\subsection{Case (2)}

In this case we have:
\begin{eqnarray}
\label{eq:lambda_der}
\frac{\partial E}{\partial \lambda_k} &=&  \sum_i \tilde{P}^{(t)}(y_i=k)
\frac{\partial}{\partial \lambda_k} \log Q(y_i=k|\theta)\\
\label{eq:sigma_der} \frac{\partial E}{\partial \Sigma_k} &=&
\sum_i \tilde{P}^{(t)}(y_i=k)
\frac{\partial}{\partial \Sigma_k} \log q(\psi_i|y_i=k,\upsilon_i,\theta_k)\\
\frac{\partial E}{\partial \theta_k}&=&\sum_i
\tilde{P}^{(t)}(y_i=k) [(\frac{\partial}{\partial
\theta_k}\phi_{k}(\upsilon_i,\theta_k))^\top\Sigma_{k}^{-1}
(\psi_i-\phi_{k}(\upsilon_i,\theta_k))], \label{eq:theta_up0}
\end{eqnarray}
where $E$ is the cost function that we would like to maximize in Eq.~\ref{eq:MDef}.
%RRChange Eq above, [there was an error]
%%SS: OK

This gives the following update rules for $\lambda_k$ and
$\Sigma_k$, where Lagrange multipliers were used to incorporate
the constraint that the sum of the $\lambda_k$'s is 1:
\begin{eqnarray}
\label{eq:lambda_up}
\lambda_k^{(t)}&=&\frac{1}{N}\sum_i\tilde{P}^{(t)}(y_i=k)\\
 \label{eq:Sigma_up} \Sigma_k^{(t)}&=&\frac{\sum_i
\tilde{P}^{(t)}(y_i=k) (\psi_i-\phi_{k}(\upsilon_i,\theta_k))
(\psi_i-\phi_{k}(\upsilon_i,\theta_k))^\top}{\sum_i\tilde{P}^{(t)}(y_i=k)}
\end{eqnarray}

To keep the formulation general, we have not yet defined the form
of the specialized functions $\phi_k$. Whether or not we can find
a closed form solution for the update of $\theta_k$ depends on the
form of $\phi_k$. For example if $\phi_k$ is a non-linear
function, we may have to use iterative optimization to find
$\theta_{k}^{(t)}$. If $\phi_k$ yields a quadratic form, then a
closed form update exists.

%\comment{Is there some place in this paper that provides the
%details of the update for the $\phi_k$ you used?  For instance NN
%or other function? As it is, the paper is incomplete; it does not
%give all of the details that someone needs to duplicate your
%system.}

\changed{Regarding our generative model, there is very little learning
involved. In fact, we know exactly the image that will be generated
given a body pose $\mb{h}$ thanks to our function $\zeta$. Thus, in
theory $p(\mb{x}|\mb{h})$ has zero variance. However, the prior probability
over poses $p(\mb{h})$ is unknown, but interestingly, as we will see in
the following section, we do not need to specify it in our generative
model.}

\subsection{Stochastic Learning}

The aforementioned optimization equations for the discriminative model
can be used to find a local minimum given the initial parameter
values. In order to improve this process, and avoid some of the local
minima that inevitably arise, we use an annealing schedule on the
$\tilde{Q}^{(t)}$ probabilities (the posteriors written $\tilde{P}^{(t)}$ in the E-step) during the M-step. In this way, we
redefine:
\begin{equation}
\tilde{Q}^{(t)}(y_i=j) \leftarrow
\frac{e^{\log(\tilde{Q}^{(t)}(y_i=j))/T(t)}}{\sum_{k \in \cl{C}}
e^{\log(\tilde{Q}^{(t)}(y_i=k))/T(t)} }.
\end{equation}

In our experiments, the temperature parameter $T$ decays
exponentially. This step not only helps in avoiding local minima,
but it also creates two desirable effects. It forces
$\tilde{Q}^{(t)}(y_i=j)$ to be binary (either $1$ or $0$) at low
temperatures; as a consequence each point will tend to be mapped
by only one specialized function at the end of optimization.
Moreover, it makes $\tilde{Q}^{(t)}(y_i=k)$ ($k=1,2,...,M$) be
fairly uniform at high temperatures, making the optimization less
dependent on initialization.

%Note that in some cases, there is no closed-form solution for the
%M-step. In practice we have decided to perform two or three
%iterations per M-step. A source of randomness added to the process
%so far described consists of choosing data points randomly and
%uniformly distributed when performing the M-step. These two
%variants of the M-step have been justified in the sense of a
%partial M-step \cite{Neal98}.

%\comment{While the above paragraph makes some sense, it is really
%unclear how you are actually performing the M-step.  It is best
%understood in an example (say for your MLP that would be used in
%the experiments anyway). Perhaps a new subsection is needed here
%to give a summary of the learning algorithm, and the MLP example.}

\section{Inference}

\label{sec:InfSMA} 
%%\changed whole section!!
\changed{
In this section, we refer to probabilistic inference as finding a full
probability distribution for $\mb{h}|\mb{x}^*$ once an observation
$\mb{x}^*$ has been made (\eg our observation could be some image
features).}

\subsection{Inference using the Discriminative Model Alone}
\changed{
Learning the discriminative (bottom-up) model yields a set of
specialized functions that map elements from the input space to the
output space. Each of the specialized functions maps different parts
of the input space with different levels of accuracy. This mapping
behavior is described probabilistically by $q$. A valid approach to
inference is to use the discriminative model alone. In order to
understand how this differs from our proposed solution (where we
combine both generative and discriminative models), we will now show
what inference involves in terms of maximum a posteriori (MAP)
estimation using the discriminative model.}

\changed{
Note that in a general sense inference involves finding a full
probability distribution for $\mb{h}|\mb{x}^*$; the discriminative
model directly provides this expression. In MAP estimation we just
have to maximize it (\ie we want to find the most likely output
hypothesis $\mb{h} \in \Re^t$ for a given observation $\mb{x}^* \in
\Re^c$):
\begin{equation}
\hat{\mb{h}}=\arg\max_\mb{h} q(\mb{h}|\mb{x}^*)=\arg\max_\mb{h} \sum_y
q(\mb{h}|\mb{x}^*,y) Q(y).
\end{equation}
Any further treatment depends on the properties of the probability
distributions involved.}

In both Cases (1) and (2) considered in previous sections, we can
write $q(\mb{h}|\mb{x},y)=
{\cl{N}}(\mb{h};\phi_y(\mb{x}),\Sigma_y)$.
%In Case (2), by
%definition this is exactly the form of the conditional
%distribution. In Case (1), the form of $\phi_y$ and the covariance
%are described in Eqs. \ref{eq:phiGaussian} and
%\ref{eq:sigmaGaussian}.
\changed{Thus, in either case we have that $q(\mb{h}|\mb{x})$ is a mixture of
Gaussians and if we want to find the MAP estimate we need to solve:}
\begin{equation}
\label{eq:StdInf}
\hat{\mb{h}}=\arg\max_\mb{h} \sum_y
{\cl{N}}(\mb{h};\phi_y(\mb{x}^*),\Sigma_y) Q(y). \label{eq:hMix}
\end{equation}

Recall that Eq.~\ref{eq:StdInf} was obtained as a result of performing
(MAP) inference using our learned discriminative model alone, where we
learned $q(\mb{h}|\mb{x})$ as an approximation to the true
distribution defined by $p(\mb{h}|\mb{x})$. Even though we could
simply adopt this as a solution, it should not be surprising that we
could improve upon this solution by using our knowledge of $p$. In the
rest of this section we explain how this could be done and more
importantly, why it is reasonable to think that this should be done at
all.

%However, we have yet to make use of the inverse (rendering) function
%$\zeta:{\Re}^t\rightarrow{\Re}^c$ in our framework.
%RRChange Eq.~\ref{eq:StdInf}...
%% SS: OK

\subsection{Inference Using the Generative Model Alone}
\label{sec:Inf2}

%(recall that the generative model is built from knowledge of the function $\zeta$, the image generating function, thus we use the term 'true' posterior)

Using the generative model, inference involves finding the posterior $p(\mb{h}|\mb{x})$:
\beqa
p(\mb{h}|\mb{x})=\frac{1}{Z_p}{\cl N}(\mb{x};\zeta(\mb{h}),\Sigma_\zeta)p(\mb{h})\\
\label{eq:Zp}
Z_p=\int {\cl N}(\mb{x};\zeta(\mb{h}),\Sigma_\zeta)p(\mb{h})d\mb{h}.
\eeqa
There are, however, at least two difficult obstacles to achieving this: (1) the integral in Eq.~\ref{eq:Zp} cannot be solved easily and, moreover, (2) we do not have an accurate expression for $p(\mb{h})$ (we know less about this portion, $p(\mb{h})$, of the generative model; recall that $p(\mb{x}|\mb{h})$ is assumed to be an accurate image (or image feature) generating distribution).

In MAP estimation, the goal is to find $\hat{\mb{h}}$ such that:

\begin{equation}
\hat{\mb{h}}=\arg\max_\mb{h} p(\mb{h}|\mb{x})=\arg\max_\mb{h} {\cl N}(\mb{x};\zeta(\mb{h}),\Sigma_\zeta)p(\mb{h}),
\end{equation}
needless to say that in the case of body pose estimation, this is in
general, a highly complex non-linear optimization problem (tracking)
as we have seen before.

A key idea in this paper is that both obstacles would become much
simpler if, somehow, we could accurately obtain samples from
$p(\mb{h}|\mb{x})$. Those samples could be used to (1) approximate
this posterior and (2) find the most likely sample for MAP
estimation. However sampling accurately from a given distribution, in
particular $p(\mb{h}|\mb{x})$, is in general an open problem
\cite{McKay98}. In the remainder of this section we will see why this
idea is well motivated and how this can be done.

\subsection{Inference and Importance Sampling. Combining Generative and Discriminative Models}
\label{sec:GenInf}
Sampling can be used to estimate expectations of a given function
$I(z)$ with respect to some probability density $\pi(z)$ that we can
evaluate at any point, but that we cannot sample from. Let us say we
need to calculate the following integral:

\beqa 
\label{eq:int}
{\cl I}=\int \pi(z)I(z)dz, 
\eeqa 
by approximating ${\cl I}$ employing  $R$  samples,
\beqa 
\hat{\cl I}=\frac{1}{R}\sum_{r=1}^R I(z^{(r)}).  
\eeqa

The question is how to appropriately generate the samples to obtain the
best estimate for ${\cl I}$. Sampling can be used in more general tasks.
Usually it is only necessary to be able to evaluate $\pi^*(z)$ which is
equal to $\pi(z)$ to within a multiplicative factor (\ie
$\pi(z)=\pi^*(z)/Z$). Thus, in Eq.~\ref{eq:int} $\pi$ could instead be a
positive function.

Since we cannot usually generate samples accurately, we need to
account for our sampling inaccuracies. Importance sampling is a method
that accounts for this as follows. First we come up with a proposal
distribution $\pi'(z)$, which we can also evaluate at least to within a
multiplicative factor but from which it is possible to
sample. Then we sample from $\pi'(z)$, but also correct for the bias
introduced when sampling from the wrong distribution by:

\beqa
\hat{\cl I}=\frac{1}{R}\sum_{r=1}^R \frac{\pi^*(z^{(r)})}{\pi'(z^{(r)})}I(z^{(r)}).
\eeqa

The variance of this estimator is given by:

\beqa
\sigma^2(\hat{\cl I})=\frac{1}{R(R-1)}\sum_{r=1}^R (\frac{\pi(z^{(r)})}{\pi'(z^{(r)})}I(z^{(r)})-\hat{\cl I})^2.
\eeqa

Also, it can be shown that when $R\rightarrow\infty$, $\sqrt{R}(\hat{\cl I}-{\cl I})\sim{\cl N}(0,\sigma^{2}_{\pi'})$, with:

\beqa
\sigma^{2}_{\pi'}=\int (\frac{\pi(z)}{\pi'(z)}I(z)-{\cl I})^2 \pi'(z)dz,
\eeqa
and the expected value of the variance of our estimate is proportional to $\sigma^{2}_{\pi'}$ and inversely proportional to $R$.

Since we would like to maximize the accuracy of our estimator, or
minimize $\sigma^{2}_{\pi'}$, a theorem by Rubinstein \cite{Rubinstein81} is
useful in telling us what is the optimal proposal distribution to use
in order to achieve this. The optimal proposal distribution to
approximate ${\cl I}$ is given by:

\beqa
\label{eq:Just}
\pi'(z)=\pi^*(z)/\int \pi^*(z)dz,
\eeqa
the normalized function $\pi^*(z)$. Since, on purpose, we introduced $\pi^*(z)$ first as an unnormalized distribution originating from $\pi(z)$, we know that upon normalization we will get back the original distribution $\pi(z)$.

Thus, in finding a posterior distribution for body poses given
observed image features $p(\mb{h}|\mb{x})$ the partition function in
Eq.~\ref{eq:Zp} can be computed as follows:

\beqa
Z_p=\int p(\mb{x}^*,\mb{h})d\mb{h},
\eeqa
using importance sampling we have the following approximation:
\beqa
\label{eq:hatz}
\hat{Z}_p=\frac{1}{S} \sum_{s=1}^S p(\mb{x}^*,\mb{h}\us)/p'(\mb{h}\us).
\eeqa

We have seen that the best distribution to be used for sampling is
$p(\mb{h}|\mb{x}^*)$; unfortunately we cannot sample from it.

The main reason behind using generative and discriminative models
together is to tackle this particular problem of sampling from a good
distribution. We will use the learned distribution $q(\mb{h}|\mb{x})$
(discriminative model) to approximate $p(\mb{h}|\mb{x})$ at
$\mb{x}=\mb{x}^*$. This approximation is in terms of maximum likelihood
estimation and can also be seen as minimizing the KL divergence
between the empirical distribution $p_e$, given by the training data,
and the model distribution $q$: $KL(p_e(\mb{h})|q(\mb{h}))=\int
p_e(\mb{h}) \log [p_e(\mb{h})/q(\mb{h})] d\mb{h}$. Of course, we
assume that the data is composed of representative examples from $p$,
so that the empirical distribution $p_e$ is at all useful.

Eq.~\ref{eq:Just} justifies this choice since it tells us that in order
to find a good approximation for the posterior $p(\mb{h}|\mb{x})$ we
should find a proposal distribution that is similar to it. We may then
ask whether we could use this proposal distribution alone. The reason
why this is not a good idea is that, since we cannot usually find a
proposal distribution that matches the true posterior perfectly, using
this proposal distribution alone is expected to perform worse than
when combined with our accurate generative model. In regions where the
proposal distribution is bad at approximating $p$, we can always
evaluate $p$ and notice the discrepancy.

The distribution $q(\mb{h}|\mb{x})$ is an approximation (\eg in the KL
sense) to $p(\mb{h}|\mb{x})$ in the space of all distributions with
the structure specified by the bottom-up model (a mixture model in our
case). For Gaussian mixture models, it is known that this approximation
can be made as accurate as we wish in the limit of infinite data and
mixture components. Interestingly, we do not need to know explicitly
what $p(\mb{h})$ is in our generative model, finding a good
$q(\mb{h}|\mb{x})$ is still the optimal way to approximate the true
posterior using this method. Thus, even if we use a not so good
assumption for $p(\mb{h})$, still we know what we need to do in order
to achieve a good estimate of the posterior. This is helpful since we
do not really know accurately what $p(\mb{h})$ is (given that we may
not have enough data to estimate it accurately). In the following we
simply use a uniform distribution (in a reasonable finite domain).

To summarize, in order to compute the posterior distribution of body
poses $\mb{h}$, given an observation of image or image features
$\mb{x}^*$, we calculate an estimate for $p(\mb{h}|\mb{x}^*)$ as follows:

\beqa
\hat{p}(\mb{h}|\mb{x}^*)=\frac{1}{\hat{Z}_p}{\cl N}(\mb{x}^*;\zeta(\mb{h}),\Sigma_\zeta)p(\mb{h}),
\eeqa
with $\hat{Z}_p$ given by Eq.~\ref{eq:hatz} and using samples from $q(\mb{h}|\mb{x}^*)$.
%ATT H

%% Assuming
%% that the data is composed by representative examples from $p$ we hope
%% to learn a good


%% Thus, this justifies why learning a discriminative distribution
%% $q(\mb{h}|\mb{x})$ is a sensible approach. When learning $q$ from
%% training data, we are trying to approximate $p$ assuming that the data
%% is composed by representative examples from the true distribution.

%% It is known that a mixture of Gaussians can approximate
%% any distribution if enough mixture components are used. Thus, in the
%% limit of infinite data and a large enough mixture our discriminative
%% distribution $q$ could in theory approximate the generative
%% distribution $p$.


%% p(x*)=\int p(x*,h) dh

%% approx with (using IS)

%% p(x*)=1/R \sum p(x*,h)/p'(h)

%% Rubinstein says that the best dist that we can use to sample is p(x*,h) normalized, so that it is a valid pdf, which is p(x*,h)/int p(x*,h) dh=  p(x*,h)/int p(x*) = p(h|x*)!!!!

\subsection{Non-deterministic MAP Estimation: Multiple Samples (MS)}
\label{sec:MS}

We are usually interested in providing likely samples from the
posterior distribution, in particular we might be interested in the most likely $\mb{h}$. This is the idea behind MAP estimation, where we are interested in
finding:

\beqa
\label{eq:ForMAP}
\hat{\mb{h}}=\arg\max_\mb{h} p(\mb{h}|\mb{x}^*)
\eeqa

We know that the discriminative model distribution $q(\mb{h}|\mb{x})$
tries to approximate $p(\mb{h}|\mb{x})$, and therefore it is good at
minimizing the variance of the estimator. Due to this, we will use the
discriminative model distribution to provide samples for MAP
estimation. In MAP estimation, we sample ${\cl H}_{Spl}= \{\mb{h}_s
\}_{s=1...S}$ using the proposal distribution
$q(\mb{h}|\mb{x}^*)$. Given the samples, the problem then becomes a
discrete optimization problem that can be solved easily:

\begin{equation}
\label{eq:ASolM}
{\hat s}=\arg\max_s p(\mb{x}^*|\mb{h}_s)=\arg\min_s
(\mb{x}^*-\zeta(\mb{h}_s))^\top \Sigma_\zeta^{-1}
(\mb{x}^*-\zeta(\mb{h}_s)),
\end{equation}
by using the Gaussian form of $p(\mb{x}|\mb{h})$ as given in Eq.\
\ref{eq:zetaNormalDist}.

We remark that using the samples ${\cl H}_{Spl}$ as a starting point,
other more sophisticated methods could be employed. For example we
could use Markov chain Monte Carlo (MCMC) sampling \cite{Mackay98} to
search for regions of higher probability. Also, instead of stochastic
methods, we could employ standard gradient descent methods to locally
search for more likely poses $\mb{h}$ (as in tracking). These methods
may be helpful for some distributions but in general have several
drawbacks: (1) They are usually very slow in high dimensions and (2)
given finite time, not very useful/accurate if the posterior
probability is very complex. Keeping this extension in mind, in this
paper we simply use the original samples ${\cl H}_{Spl}$ to search for
a MAP estimate. Empirically speaking, these estimates also provide
very good results in our experiments.

%% Let us assume that we can approximate $\sum_y p(\mb{h}|\mb{x},y)
%% P(y)$ by a set of samples generated according to
%% $p(\mb{h}|\mb{x},y) P(y)$ and a kernel function
%% $K(\mb{h},\mb{h}_s)$, such that $K(\mb{h},\mb{h}_s) \geq 0$ and
%% $\int K(\mb{h},\mb{h}_s) d\mb{h}=1$ for any given $\mb{h}_s$.
%% Given a set of samples ${\cl H}_{Spl}= \{\mb{h}_s \}_{s=1...S}$,
%% we can construct the approximation $\sum_y p(\mb{h}|\mb{x},y) P(y)
%% \approx \frac{1}{S} \sum_{s=1}^S K(\mb{h},\mb{h}_s)$. We now
%% consider two simple forms for the kernel function $K$.

%% If we use a Dirac delta function kernel centered at each sample
%% $K(\mb{h},\mb{h}_s)=\delta(\mb{h}-\mb{h}_s)$, then we have: $
%% \mb{h}^* \approx \arg\max_\mb{h} p(\mb{x}|\mb{h})
%% \frac{1}{S}\sum_{s=1}^S \delta(\mb{h}-\mb{h}_s)$. This can be
%% reduced to an equivalent discrete optimization problem where the
%% goal is to find the most likely sample $s^*$:
%% \begin{equation}
%% \label{eq:ASolM}
%% s^*=\arg\max_s p(\mb{x}|\mb{h}_s)=\arg\min_s
%% (\mb{x}-\zeta(\mb{h}_s))^\top \Sigma_\zeta
%% (\mb{x}-\zeta(\mb{h}_s)),
%% \end{equation}
%% by using the Gaussian form of $p(\mb{x}|\mb{h})$ as given in Eq.\
%% \ref{eq:zetaNormalDist}.

%% If instead we use Gaussian kernels centered at each sample
%% $K(\mb{h},\mb{h}_s)={\cl N}(\mb{h};\mb{h}_s,\Sigma_{Spl})$, then
%% we have: $\mb{h}^* \approx \arg\max_\mb{h}  p(\mb{x}|\mb{h})
%% \frac{1}{S}\sum_{s=1}^S {\cl N}(\mb{h};\mb{h}_s,\Sigma_{Spl})$.
%% This approximation is harder to use in practice. Unlike the Dirac
%% delta kernel approximation, the Gaussian approximation cannot be
%% reduced to an equivalent discrete optimization since there is no
%% guarantee that the optimal $\mb{h}$ for this form is among the
%% samples in general.
\subsection{Deterministic MAP Estimation: Mean Output (MO)}
\label{sec:MO}

In certain applications, it might be advantageous to have a very
fast method for computing MAP estimates. Two examples are: when
working with multiple articulated bodies or in dynamic settings where
it is necessary to provide estimates at a high rate. Even though the
time complexity of MS scales linearly with the number of samples, this
might not be fast enough. Motivated by speed constraints, here we
propose a very fast MAP estimation algorithm that still performs well
in experiments. Unlike MS, this algorithm is deterministic.

The structure of the problem, as well as the form of the
discriminative distribution components (\ie conditioned on the mixture
label) $q(\mb{h}|\mb{x},y)$ employed (Gaussian), make it possible to
construct this deterministic approximation to Eq.\ \ref{eq:ForMAP}. The
basic intuition is straightforward. For a given $\mb{x}$, we {\em ask}
each specialized function $\phi_k$ to give its most likely estimate
for $\mb{h}$.  We then evaluate the probability of each function's
estimate via the generative model distribution
$p(\mb{x}|\mb{h})$. This approximation is good in practice, as will be
demonstrated in the experiments.

To justify this deterministic approximation, we note that due to
concavity properties, the probability of the mean is maximal in a
Gaussian distribution; \ie it is the most-likely value.  Formally, in
both Case (1) and Case (2) described earlier,
$p(E[\mb{h}|\mb{x},y,\theta]\,|\,\mb{x},y,\theta)\geq p(\mb{h}'|\mb{x},y,\theta)$, for any
$\mb{h}'$. Consider again the set of samples ${\cl H}_{Spl}=
\{\mb{h}_s \}_{s=1...S}$ generated in the MS approximation. We can
build a set of samples ${\cl H}_{\phi}=\{\mb{h}_{k}^{\phi}
\}_{k=1...M}$ that has the property:
\begin{equation}
\forall y, \max_k p(\mb{h}_{k}^{\phi}|\mb{x},y) \geq \max_s
p(\mb{h}_{s}|\mb{x},y)
\end{equation}
simply by setting $\mb{h}_{k}^{\phi}=\phi_k(\mb{x},\theta)$.

This insight leads to a deterministic approximation for inference, the
{\it Mean Output} solution (MO). This approximate solution relies on
the observation that by considering the means $\phi_k(\mb{x})$, we
would be considering the most likely output of each specialized
function (\ie each mixture component in the discriminative model),
given the input. Obviously, we expect the discriminative model to provide
a good approximation of our generative model posterior distribution as
discussed above. Also, the smaller the overlap among the distributions
associated with each specialized function, the better the accuracy of
this approximation.

In MO approximate inference, the expression to be minimized is the
same as that used in Eq.\ \ref{eq:ASolM}, except for the use of
the $M$ means instead of the $S$ samples:
\begin{equation}
k^*=\arg\max_{k \in {\cl C}}  p(\mb{x}|\mb{h}_{k}^{\phi})
=\arg\min_{k \in {\cl C}} (\mb{x}-\zeta(\mb{h}_{k}^{\phi}))^\top
\Sigma_\zeta (\mb{x}-\zeta(\mb{h}_{k}^{\phi})). \label{eq:ASolU}
\end{equation}
This generally requires substantially less computation than would be
required in the MS approach. \changed{The main motivation behind MO is
to increase computational savings.}

\section{Example Application: Articulated Pose from Visual Features}
\label{sec:Apps}

The formulation presented in this paper is rather general, and could
be applied in a number of supervised learning problems for which the
output-to-input (feedback) map is relatively easy to compute; \changed{
thus allowing us to specify an accurate generative model}. To
demonstrate and test our framework, we have developed a system that
uses our approach to infer articulated pose from low-level visual
features. In particular, we focussed on pose estimation of the human
hand and body from an image silhouette. In this class of computer
vision applications, ground truth datasets for use in training can be
obtained via motion capture gloves or body suits, and computer
graphics rendering can be used to generate the input-output pairs used
in supervised learning.  We will now give details of this
demonstration system.

\subsection{3D Hand Pose Estimation}
\label{sec:AppsHand}

 In this application, our goal is to recover
detailed 3D hand pose from silhouette features computed from a
single color image. Hand pose is defined in terms of the hand
joint angles. In general, we are also interested in global
orientation of the hand. We explore two applications: estimation
of the internal joint angles only, and later, estimation of both
internal joint angles and global orientation of the hand.

\subsubsection{Hand Model}

We utilize the hand model provided in the VirtualHand programming
library \cite{virtual_hand}. The model parameters are 22 joint
angles. For the index, middle, ring and pinky finger, there is an
angle for each of the distal, proximal and metacarpophalangeal
joints. For the thumb, there is an inner joint angle, an outer
joint angle and two angles for the trapeziometacarpal joint. There
are also abduction angles between the following pairs of
successive fingers: index/middle, middle/ring and ring/pinky.
Finally, there is an angle for the palm arch, an angle measuring
wrist flexion and an angle measuring the wrist bending towards the
pinky finger. However, because the latter two wrist angles also
encode global orientation, we decided not to model them in our
application. Hence, ignoring these two angles, our model has 20
DOF for the internal hand configuration.

All of these 20 angles are relative to two global orientation
angles. These two angles will encode the camera viewpoint (or
alternatively hand 3D rotation). Imagine a sphere surrounding the
hand model, \ie a fixed hand center point is at the center of the
sphere. For ease of reference, we will employ the widely used
latitude and longitude notions. The first angle $\beta_1$
represents the latitude from which we are looking at the hand, the
second angle $\beta_2$ represents the longitude. We have defined
$\beta_1 \in [0,\pi]$, with zero and $\pi$ being the {\it poles}
of the sphere and $\beta_2 \in [0,2\pi)$. Thus, in summary our
full hand model has 22 DOF.


\psfigurepath{./figs/}
\begin{figure}[t]
\centerline{\small
\psfig{figure=AllViewsHandBin.ps,width=0.6\textwidth} }
\mycaptionS{\small Example of the 86 silhouettes obtained via
computer graphics rendering for a given 3D hand pose.  Views are
distributed approximately uniformly over the view
sphere.}\label{fig:HAllViews}
\end{figure}

\subsubsection{3D Hand Motion Datasets}
\label{sec:3DHDS}

Using a CyberGlove, we collected approximately 9,000 examples of
3D hand poses.  This data included hand configurations from
American Sign Language (ASL) and other configurations informally
performed by several members of our research group. Using computer
graphics and an artificial hand model, we then rendered each
captured hand pose from multiple viewpoints on the view sphere. In
our implementation, we defined a set of 86 viewpoint angle pairs
$(\beta_1,\beta_2)$ so that the sphere surface is sampled
approximately uniformly.  Thus we obtained a full dataset of
$9{,}000 \times 86$ views.  Each view has an associated binary image
mask (silhouette), and a 22 DOF pose vector. Fig.\
\ref{fig:HAllViews} shows the 86 viewpoints used in the dataset.


From these silhouettes, we extract the visual features that will
be used for further processing. In our implementation, we used two
classes of features (these features are not used together): Hu
moments and Alt moments. Alt moments \cite{Alt62} are translation
and scale invariant, but not rotation invariant. Hu moments
\cite{Hu62} are invariant to translation and scaling, but also
invariant to rotation in the image plane. These moment features
were used in our implementation because they are relatively easy
to compute, and they provide invariants that are appropriate for
our demonstration application. However, our general formulation can be used with other visual feature representations
if desired. Detailed examination of the feature selection problem
is outside the scope of this paper, and remains a topic for future
research.

The above process yields a set of input-output (cue-pose) pairs to
be used in our experiments. We define two experimental datasets:
\begin{enumerate}
\item {\em Hand-Single-View:} In this dataset, the hand is viewed
from only one viewpoint ($\beta_1=\pi/2$, $\beta_2=0$), generally
making the palm of the hand visible.  Silhouette features are
computed using Alt moments.  This yields approximately 9,000
input-output pairs.

\item {\em Hand-All-Views:} In this dataset, the hand is viewed
from all 86 viewpoints. Silhouette features are computed using Hu
moments. This yields approximately 750,000 input-output pairs.
\end{enumerate}

\subsubsection{Hand Detection and Segmentation}
\label{sec:segment}

For live video input, we will use video sequences collected with a
color digital camera. It will be assumed that these sequences have a
static background and only one person is present. In this
implementation, we are not considering hand occlusion analysis, which
by itself is a difficult task. Our system tracks both hands of the
user automatically using a skin color tracker \cite{sigal_2000,RosalesICCV01}.
%RRChange .... and the person is facing towards the camera  [is not needed]
%%SS: OK

\subsection{2D Human Body Pose Estimation}
\label{sec:2DBP}

In this application, our goal is to recover the articulated pose
of a human body observed in a single image. The methodology
followed is very similar to that used in the estimation of hand
pose. However, instead of joint angles, body pose will be
specified in terms of marker positions at a predetermined set of
joints.  We will estimate the 2D positions of these body
markers in the image plane, given visual features as input.

\subsubsection{Human Body Model}

The human body model is defined in terms of 20 3D marker positions
(60 DOF). The 20 markers are distributed as follows: three markers
for the head, three markers for the hip/back bone articulation,
plus one marker for each shoulder, elbow, wrist, hand, knee,
ankle, and foot. For computer graphics rendering, the body model
is composed of cylinders of equal width. The cylinders connect the
markers to form the standard human body structure. The thorax is
modeled using a wider cylinder. Because we are only interested in
the shape of the projected model, we do not include texture or
illumination in our rendering.

\subsubsection{Human Body Pose Dataset}

Human body motion capture data was obtained from several sources:
http://www.biovision.com, Matt Brand's dataset \cite{Brand99}, and
several demo sequences in the software package {\em Character
Studio}.  In total there are 32 captured sequences that depict
variations of different activities: dancing, walking, kicking,
waving, throwing, jumping, signaling, crouching down. The total
number of frames collected is approximately 7,000, mostly at 30
frames/second. Using computer graphics and our artificial body
model, we then rendered each frame from 16 equally-spaced
viewpoints on the equator of the view sphere centered at the hip
of the body model. For each view, we also used the camera model to
obtain the 2D marker positions in the image plane. Thus we
obtained a full dataset of approximately $7{,}000 \times 16$ views.
Each view has an associated binary image mask (silhouette), and a
40 DOF projected marker vector.

From the silhouettes, we extract the visual features that will be
used as input.  For this application, we have chosen
Alt moments \cite{Alt62} as our visual features, mainly due to
their ease of computation and invariance to translation and
scaling.

The above process yields a set of input-output (cue-pose) pairs to
be used in our experiments.  In this case, the cues are the Alt
moments for a particular view, and the pose is encoded in terms of
the projected locations of the body markers in the image plane (40
DOF). We call this dataset the {\em Body-All-Views} dataset.

\subsubsection{Detection and Segmentation}

\label{sec:BodyDet}

For live video input, we use sequences collected with a color
digital camera. It is assumed that these sequences have a static
background, only one person is present, and the person is
fully-visible. We use a simple and widely-used human body
segmentation scheme \cite{Hogg83,Wren96}. The technique
employs statistical learning to acquire a model of the background
appearance, where each pixel's color (luminance) is represented by
a Gaussian distribution. Segmentation is then approached in a
maximum-likelihood fashion, where each pixel is classified as
belonging to one of two classes: the background or the foreground
(human body).

\subsection{Common Implementation Details}

We now briefly discuss implementation details common to both
applications.

\subsubsection{Mapping Functions}

In Sec.\ \ref{sec:ProMod}, it was not specified what class of
(deterministic) mapping functions ${\phi_k}$ were to be used. Our
framework is practically independent of this choice. However, from
Eq.\ \ref{eq:theta_up0} we can notice that there are clear advantages
in the M-step if these functions are differentiable with respect to
their parameters. In the case of quadratic or linear functions, the
M-step can be performed exactly in one step. However, the flexibility
of these functions is limited.  In our implementation each mapping
function is a multi-layer perceptron with one hidden layer (MLP). For
the non-linear one hidden layer perceptrons, there does not exist a
closed-form solution for Eq.\ \ref{eq:theta_up0}. In our
implementation, we used four to five iterations of the conjugate
gradient method per M-step.

\hide{
% this paragraph isn't needed
For the non-linear one hidden layer perceptrons, there does not
exist a closed form solution for Eq.~\ref{eq:theta_up0}. We use
the conjugate gradient (CG) optimization method, for performing
the M-step. If $\phi_k$ is a one hidden layer perceptron with
parameters $\theta_k$, we have:

%% This is simply a restatement of Eq 14. with nothing new.  So it should not be included.

\beqa \frac{\partial E}{\partial \theta_k}&=&\sum_n
\tilde{P}(y_i)[(\frac{\partial}{\partial
\theta_k}\phi_{k}(\upsilon_n,\theta_k))^\top\Sigma_{k}^{-1}
(\psi_n-\phi_{k}(\upsilon_n,\theta_k))], \eeqa

% I thought about adding this, but it seems non-essential at this point (sorry).

Since in a one hidden layer perceptron the parameters are a set of
real-valued weights, let us explicitly denote the parameters of
$\phi_k$ as $\theta^k=\{w_{jil}^k\}$, where $w_{jil}^k$ denote the
synaptic weight from node $i$ to node $j$ in layer $l$, for the
specialized function $k$ \cite{HaykinBook96}. Also, denote
$\varphi$ the non-linear function relating input $s$ to output
activity $r$ in the hidden layer nodes, \ie
$r_i^{(2)}=\varphi(s_i^{(2)})$, the output nodes are assumed
linear, \ie $r_i^{(3)}=\alpha s_i^{(3)}$ \footnote{In both cases
the biases are embedded in the function definitions.}. With this
re-parameterization, we can then show that the gradient for
specialized function $k$ is:


If $l=2$ ($w$ connects the hidden with the output layer): \beqa
\frac{\partial}{\partial
w_{jil}^k}\phi_{k}(\upsilon,\theta_k)=-r_i^{(3)}\varphi'(s_j^{(3)})
\eeqa

If $l=1$ ($w$ connects the input to the hidden layer): \beqa
\frac{\partial}{\partial
w_{jil}^k}\phi_{k}(\upsilon,\theta_k)=-r_i^{(3)}\varphi'(s_j^{(3)})\sum_q
\varphi'(s_q^{(2)}) w_{qj}^{(2)} , \eeqa

with $s_i^l$ the input in node $i$ in layer $l$ and $r_i$ its
corresponding output activity.}

\subsubsection{Generative Model Details: Feedback Functions}

In the previous sections we made reference to the inverse or feedback
function denoted $\zeta$. There are at least two ways to define this
function. On the one hand, $\zeta$ could be a computer graphics
rendering function. On the other hand, we could estimate an
approximate $\hat{\zeta}$ given a set of output-input training
examples. In our implementation, we experimented with both
approaches. For $\zeta$, we used computer graphics renderings of our
hand and body models obtained via OpenGL. For $\hat{\zeta}$, we used a
one hidden-layer perceptron, with twenty hidden nodes. In our
experience, this provides an adequate and efficient approximation.
%RRChange .... with twenty hidden [replaced] with five hidden...
%%SS: OK

The approximate feedback function is useful primarily because it is
faster to compute than a graphical rendering followed by visual
feature computation. \changed{The key issue to keep in mind is that the
feedback mapping is assumed to be simple (one-to-one or even
many-to-one) or that it has a known form; otherwise, if we assume
functional forms that are too simple, we would only introduce more estimation
errors. Of course, this is just a practical issue}. If the feedback
mapping is too complex to approximate easily, we could always rely on
the available feedback function $\zeta$.
%%RRChange [Added] many-to-one
%%SS: OK

\subsubsection{Computational Performance}

For an Athlon 1400 PC with 2GB memory, running unoptimized Matlab
6.0 code, it takes approximately five hours to train a model with
10 dimensions (input) and 10 dimensions (output), using 4500
patterns, and 40 single hidden layer perceptrons with five hidden
nodes each.

Using the same setting, the system can infer body poses at
approximately 11 frames per second, using the Mean Output (MO)
algorithm. \changed{This approach's} related computations take approximately 70\% of this
time. This time includes OpenGL-based rendering of body poses in
$\zeta$. The rest is spent in segmentation and feature
calculations. The Multiple Sample (MS) algorithm takes time
proportional to the number of samples used. Of course, segmentation
and feature computation for the segmented image is done only once. We
noticed that for our implementation, if we use the approximate
feedback function, $\hat\zeta$, the rendering time is reduced to
approximately one-fourth.
%%RRChange, [I looked at my notes and fixed this]
%%SS: OK

\subsubsection{Early Stopping During Training}

During model training, we used cross-validation for early stopping and
to avoid over-fitting as follows:
%\footnote{The Minimum Description
%Length (MDL) principle \cite{Rissanen86} was also used to avoid
%overfitting as explained in the experiments}:
%%RRChange [added footnote]
%%SS: Removed: I must insist (sorry).
%% This is redundant with text elsewhere and unrelated to early stopping.

\begin{itemize}
\item {\em Training data:} Stop if the log-likelihood changes less
than 0.5\% averaged over the last ten iterations.
\item {\em Held out data:} Stop if the held out data
log-likelihood average change is negative over the last ten
iterations. Held out data was chosen in the same way as the
training and test data.
\item {\em Number of iterations:} Stop if a maximum of 200
iterations is reached.
\end{itemize}

\setlength{\tabcolsep}{1pt} %%SS: This changes separation between table columns
\renewcommand\arraystretch{0.25} %% SS: This changes separation between table rows

\section{Experimental Results}
\label{sec:Exp}

We now present experimental results obtained using our approach in
estimating the pose of the human hand and body. For many additional
performance experiments not included due to space limitations, the
reader is referred to \cite{RosalesPhDThesis} and for several MO
estimation videos to
http://www.psi.toronto.edu/$\sim$romer/SMAHandVideos.htm. The SMA
application independent Matlab code can be found at
http://www.psi.toronto.edu/$\sim$romer/SMACode.htm.

\subsection{Hand Pose Estimation Given a Fixed Camera Viewpoint}
\label{sec:FixCam}

In our first experiments, our approach is tested in the task of
recovering 3D human hand pose given a fixed camera viewpoint: a
view towards the palm of the hand. For training, we used the {\it
Hand-Single-View} dataset, which contains a total of approximately
9,000 examples. Of these, 3,000 were used for training and the
rest for testing.  All experiments were performed on a test set
that shared no common poses with the training set. The
input-output pairs were then defined as follows.  The input
consisted of 10 Alt moments computed from the silhouette of the
hand, as described in Sec.\ \ref{sec:AppsHand}. The output
consisted of 20 joint angles of a human hand linearly encoded by
nine values using Principal Component Analysis (PCA).

In this experiment, the number of mixture components for the
discriminative model (\ie the number of specialized functions) was set
to 20. This number was found to be optimal in the sense of the Minimum
Description Length (MDL) principle \cite{Rissanen86}; an exhaustive
search is impractical, so we find this number via approximate
search. Each mapping function (for each of the Gaussians in the
mixture) was a one hidden layer, feed-forward network (multi-layer
perceptron) with seven hidden neurons.

\subsubsection{Quantitative Results}

To measure the accuracy of the hand pose reconstruction, we
randomly selected approximately 4,000 frames not included in the
training set. This test set has the advantage that ground truth is
available. Using the estimated feedback function $\hat\zeta$ in
the Mean Output approach (MO), the average $L_2$ error between
reconstruction and ground-truth was $0.1863$ radians
(approximately $10^\circ$), with variance $0.0185$. These error
estimates are averaged over joint angles.  We ran this experiment
with the same test set, but instead used the computer graphics
rendering feedback function $\zeta$. When using $\zeta$, similar
accuracy was obtained. The average $L_2$ error between
reconstruction and ground-truth in this case was $0.241$ radians,
with variance $0.0312$. In \cite{RosalesPhDThesis}, we explain in
detail the reasons for this relatively small difference in
performance.
%SChange previous paragraph

\psfigurepath{./figs/H90}
\begin{figure*}[ht]
\centerline{\begin{tabular}{rcccccccccc}
GT &
\psfig{figure=HandData90_R10i.TestGT.mat.02096.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03973.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03275.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01965.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01265.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.00655.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01729.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02576.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01877.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01091.eps,width=0.63in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0_R10i.02096.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03973.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03275.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01965.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01265.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.00655.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01729.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02576.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01877.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01091.eps,width=0.63in,clip=t} \\
%
\\
\hline
\\
%
GT &
\psfig{figure=HandData90_R10i.TestGT.mat.03942.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03569.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01572.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02273.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02575.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01681.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01659.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02401.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02751.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02183.eps,width=0.63in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0_R10i.03942.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03569.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01572.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02273.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02575.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01681.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01659.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02401.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02751.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02183.eps,width=0.63in,clip=t} \\
%
\\
\hline
\\
%
GT &
\psfig{figure=HandData90_R10i.TestGT.mat.02663.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03842.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02162.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02353.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02369.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.04272.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.04048.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03872.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03856.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03840.eps,width=0.63in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0_R10i.02663.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03842.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02162.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02353.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02369.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.04272.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.04048.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03872.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03856.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03840.eps,width=0.63in,clip=t} \\
%
\\
\hline
\\
%
GT &
\psfig{figure=HandData90_R10i.TestGT.mat.03296.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02928.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02896.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02784.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02672.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.01825.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02576.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02449.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.02001.eps,width=0.63in,clip=t} &
\psfig{figure=HandData90_R10i.TestGT.mat.03328.eps,width=0.63in,clip=t}\\
%
MO &
\psfig{figure=Res_softH2V20-2_0_R10i.03296.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02928.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02896.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02784.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02672.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.01825.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02576.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02449.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.02001.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0_R10i.03328.eps,width=0.63in,clip=t} \\
\end{tabular}
} \mycaption{Estimated hand poses using Mean Output (MO) algorithm
and $\hat{\zeta}$}{\small  \CapTestI}{} \label{fig:H90Res}
\end{figure*}



Fig.\ \ref{fig:H90Res} shows example reconstructions obtained via
the MO approach. In many cases, the reconstruction is close to the
ground truth. In other cases, the silhouette is highly-ambiguous,
and the reconstruction does not match ground truth. A good example
is shown in image pair number 34 (the last row-pair, fourth column),
where the camera's image plane is perpendicular to the axis of
the pinky finger. Note that the estimated hand pose disagrees with
the ground-truth in the several joint angles associated with this
finger. Similar effects with other joint angles can be seen in
example pairs 8, 26, 37, etc.

Ambiguous configurations are indeed very common with a binary
image representation.  Note that in other ambiguous cases shown in
Fig.\ \ref{fig:H90Res} reconstruction is closer to ground truth,
\eg pairs 29, 30, etc. Possible reasons for this agreement are
diverse:
\begin{enumerate}
\item The input is not really ambiguous (probabilistically speaking)
in the observation space. The other possible outputs (geometrically
speaking) associated with this input may be very unlikely given the
training set. This depends on the underlying structure of the
configuration manifold. One of the main goals of a learning algorithm
is to find this structure. Indeed these results show that our
algorithm is finding this structure, since in most cases, MO finds a
valid sample from the manifold.
%RRChange [last sentence]
%%SS: OK
\item \changed{ The learned discriminative model was very accurate at
modeling the given input using a single mixture component} (\ie few
mapping functions were trained to map this input, therefore the rest
of the functions produced irrelevant (bad) outputs).
\item By chance, among many very similarly probable solutions, the
{\it right} one was chosen. Of course, even with the help of chance in
this case, the discriminative model needed to be accurate enough at
approximating the true posterior so that samples were relevant at all.
%mapping functions needed to provide the
%right mapping for the given input $\mb{x}$.
\end{enumerate}

The accuracy of the Multiple Samples (MS) inference approach was
tested in similar experiments with approximately $4,000$ randomly
chosen test examples not included in the training set. When the
estimated feedback function $\hat\zeta$ was used, the mean $L_2$
error of the most likely sample to the ground-truth was $0.2202$
radians with variance $0.0228$. The mean error and variance from
the best 20 samples were $0.308$ and $0.3023$, respectively. When we
performed the same experiment, but instead used the computer
graphics feedback function $\zeta$, we observed very small
quantitative differences. We obtained a mean error of $0.2628$
radians with variance $0.0242$ for the most likely sample. The
mean error of the best 20 samples was $0.3128$ radians with
variance $0.3000$.

These experiments confirmed that MO inference seems to provide a
reasonable approximation, at least for this dataset. Recall from Sec.\
\ref{sec:MO} that MO inference was based on the premise that the
most-likely reconstruction given by each \changed{discriminative
mixture component} (defined by the specialized function) provides a
good approximation to the best solution given by the full probability
distribution.
%%RChange previous paragraph

\subsubsection{Experiments with Real Images}
\label{sec:H90RealImgs}

We now test our approach using uncalibrated video sequences, where
the camera is pointing towards the palm of a person's hand. On
average, the hand occupied an area of approximately $200 \times
200$ pixels. Segmentation was obtained as described in Sec.\
\ref{sec:segment}.

In the first experiment, we use the MO approach to obtain a single
{\it best} estimate for each segmented hand. Estimates for 40
frames, taken 0.9 seconds apart, are shown in Fig.\
\ref{fig:Real90TestIE}. Visually we can notice that in most cases
the estimate is a plausible explanation of the segmented
silhouette.  However, there are also a few inaccurate
reconstructions.
%as seen in the fourth row, columns 1 and 5.
%% SS: (figure changed, so these numbers are incorrect)

In general, it is expected that the model cannot perform well
in all configurations (this is true for almost any machine
learning model) due to the following reasons:
\begin{enumerate}
\item The proposal distribution $q(\mb{h}|\mb{x})$ does not resemble
the true posterior distribution $p(\mb{h}|\mb{x})$: learning is the
result of optimizing an {\it expected} or average error.
\item The real hand and synthetic hand model features are similar
but not the same.  Anthropometric differences can influence
inference accuracy.
\item Even the best model could fail in some configurations.
Information theory tells us that this is always the case except
when the {\it information} in the features is equal to the entropy
of the body pose configurations; in other words, when features
tell us everything needed about the configuration. Otherwise,
there might be multiple explanations for a given visual feature
vector.
\end{enumerate}

In order to test the ability of the system to provide these multiple
explanations, we tested the Multiple Samples (MS) approach. Fig.\
\ref{fig:Real90TestII} shows the estimates found using MS. These
estimates can be interpreted as possible hypotheses of hand
configurations given the silhouettes. \changed{Note that MS tends to
bias the hypotheses towards samples from the distribution
$q(\mb{h}|\mb{x}^*)$, but we can account for this when building a full
probability distribution, as explained in Sec.~\ref{sec:GenInf}.}

\psfigurepath{./figs/RealResultsHand2}
\begin{figure*}[ht]
\centerline{\small \begin{tabular}{rcccccccccc}
RV &
\psfig{figure=Subsampled1_F71_770.rle.00001.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00006.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00011.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00016.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00021.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00026.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00031.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00036.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00041.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00046.eps,width=0.63in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00001.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00006.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00011.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00016.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00021.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00026.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00031.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00036.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00041.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00046.eps,width=0.63in,clip=t} \\
%
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F71_770.rle.00051.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00056.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00061.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00066.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00071.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00076.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00081.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00086.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00091.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00096.eps,width=0.63in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00051.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00056.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00061.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00066.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00071.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00076.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00081.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00086.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00091.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00096.eps,width=0.63in,clip=t} \\
%
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F71_770.rle.00101.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00106.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00111.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00116.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00121.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00126.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00131.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00136.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00141.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00146.eps,width=0.63in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00101.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00106.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00111.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00116.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00121.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00126.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00131.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00136.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00141.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00146.eps,width=0.63in,clip=t} \\
%
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F71_770.rle.00151.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00156.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00161.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00166.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00171.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00176.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00181.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00186.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00191.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F71_770.rle.00196.eps,width=0.63in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00151.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00156.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00161.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00166.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00171.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00176.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00181.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00186.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00191.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V20-2_0R10iE_Fts_Subsampled1_F71_770.mat.00196.eps,width=0.63in,clip=t}\\
\end{tabular}
} \mycaption{Hand pose estimates in real video sequences (RV)
using the Mean Output algorithm (MO).}{\small \CapRTestIE}{}
\label{fig:Real90TestIE}
\end{figure*}

\psfigurepath{./figs/RealResultsHand2}
\begin{figure*}[ht]
\centerline{
\begin{tabular}{ccccccc}
RV & MO& S1& S2& S3 & S4 & S12 \\
%
\psfig{figure=Subsampled1_F71_770.rle.00010.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00010.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00010_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00010_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00010_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00010_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00010_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00019.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00019.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00019_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00019_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00019_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00019_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00019_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00028.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00028.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00028_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00028_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00028_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00028_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00028_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00037.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00037.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00037_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00037_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00037_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00037_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00037_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00046.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00046.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00046_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00046_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00046_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00046_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00046_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00055.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00055.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00055_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00055_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00055_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00055_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00055_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00064.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00064.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00064_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00064_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00064_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00064_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00064_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00073.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00073.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00073_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00073_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00073_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00073_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00073_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00082.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00082.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00082_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00082_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00082_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00082_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00082_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F71_770.rle.00091.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_0_R10i_Fts_Subsampled1_F71_770.mat.00091.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00091_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00091_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00091_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00091_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V20-2_010iMS_Fts_Subsampled1_F71_770.mat.00091_012.eps,width=0.63in,clip=t}\\
\end{tabular}}
 \mycaption{Hand pose estimates in real sequences using multiple
sampling algorithm}{\small \CapRTestII}{} \label{fig:Real90TestII}
\end{figure*}


\subsection{3D Hand Pose Reconstruction Given an Unrestricted Camera Viewpoint}
\label{sec:3DHan}


Our approach is now tested in the task of recovering 3D human hand pose
from an unknown camera viewpoint. For training, we used the {\it
Hand-All-Views} dataset, which contains a total of approximately
750,000 examples. Of these, 18,000 were used for training and the
rest for testing. The input-output pairs were then defined as
follows.  The input consisted of seven Hu moments computed from
the silhouette of the hand, as described in Sec.\
\ref{sec:AppsHand}. The output consisted of 20 internal joint
angles of the hand and two orientation angles. This 22 DOF
representation was linearly encoded by nine values using PCA.

The number of \changed{mixture components} (specialized functions) was
set to 45.  This number was determined via the MDL criterion, as
before. Each specialized function was a one hidden layer, feed-forward
network with seven hidden nodes.

\subsubsection{Quantitative Results}

We computed the $L_2$ error in estimating hand pose, and
quantitatively compared this measure across views. Fig.\
\ref{fig:HandPerf1SampleS} shows the error of the most likely estimate
found using the MO approach. From the graphs we see that views towards
the palm of the hand ($90^\circ$) are slightly easier to reconstruct
on average, while the variance seems similar across views. As
expected, the average error is higher than that obtained for the fixed
view hand pose reconstruction experiments.  The differences in
performance obtained from using $\zeta$ or $\hat\zeta$ are relatively
small. However, it seems that for unrestricted hand views it is
advantageous to use the computer graphics feedback function
$\zeta$. This is probably because estimating this inverse mapping
$\hat\zeta$ \changed{(to define the generative model)} over
unrestricted viewpoints is more complicated than for only frontal hand
views (and the mapping is likely to be more complex also).
%%RChange previous paragraph
%SChange Previous paragraph

Fig.\ \ref{fig:HandPerf1SampleM} shows the results using the MS
approach. Fig.\ \ref{fig:HandPerf1SampleM}(a) shows the error
associated with the best sample. This error behaves very similarly to
the MO error. Fig.\ \ref{fig:HandPerf1SampleM}(b) shows the average
error computed using the best 20 samples. This error is higher than
that of the best sample. Note that this is not an obvious result given
that the best sample is determined without having knowledge of
ground-truth. In fact, if the average error of the best 20 samples
were lower than that of the best sample, then we could infer that our
algorithm is very inaccurate at determining what samples are
better. Thus this result positively endorses our MS algorithm.
%%RRChange  Thus this result positively endorses our MS algorithm.
%%SS: OK

For comparison, we used the ground-truth to select the best
sample, based on minimum RMSE.  In other words, we have an oracle
that picks the sample closest to the ground-truth. The resulting
performance graph is shown in Fig.\ \ref{fig:HandPerf1SampleM}(c).
This represents the lower-bound on the reconstruction error using
the learned forward model. The graph is interesting in the sense
that it separates the errors from the forward and feedback models.
The feedback model produces an RMSE $< 0.35$ across views. This is
roughly half the total RMSE produced by our method overall.


\psfigurepath{./figs}
\begin{figure}[t]
\centerline{(a)
\psfig{figure=GG_Res_softH2V31-7_7_GR7i.mat.EType_0.eps,width=3in,clip=t}
~~~(b)
\psfig{figure=GG_Res_softH2V31-7_7_R7iE.mat.EType_0.eps,width=3in,clip=t}
} \mycaption{Unrestricted view model performance using Mean Output
(MO) and $\hat{\zeta}$}{\small Mean Output (MO) inference
performance for unrestricted view tests at given viewpoint
latitudes (averaging over longitude). The feedback function is (a)
the estimated $\hat\zeta$ (b) the computer graphics rendering
$\zeta$. A frontal view of the hand palm is at latitude
$\beta_1=\pi/2$, longitude $\beta_2=0$}{}
\label{fig:HandPerf1SampleS}
\end{figure}
%%RChange previous figure
%SChange previous caption

\psfigurepath{./figs}
\begin{figure}[t]
\centerline{\small (a)
\psfig{figure=GG_Res_softH2V31-7_7_R7iM.mat.EType_1.eps,width=2.0in,clip=t}
\small (b)
\psfig{figure=GG_Res_softH2V31-7_7_R7iM.mat.EType_2.eps,width=2.0in,clip=t}
\small (c)
\psfig{figure=GG_Res_softH2V31-7_7_R7iM.mat.EType_3.eps,width=2.0in,clip=t}
} \mycaption{Unrestricted view model performance using multiple
sampling and $\hat{\zeta}$}{\small Multiple Samples (MS) inference
for unrestricted view tests at given viewpoint latitudes
(averaging over longitude).  The feedback function is the estimated
$\hat{\zeta}$. A frontal view of the hand palm is at latitude
$\beta_1=\pi/2$, longitude $\beta_2=0$. (a) Most probable sample.
(b) Average over all samples (20 most probable samples taken). (c)
Best sample (determined using ground-truth information for
comparison)}{} \label{fig:HandPerf1SampleM}
\end{figure}

\subsubsection{Experiments with Real Images}
\label{sec:HAnyRealImgs}

As before, we test our approach using video collected from a
single uncalibrated camera.  However, in this case, the person's
hand can appear at any orientation.

Pose estimates from 40 frames (taken 0.9 seconds apart)
obtained via the MO approach are shown in Fig.\
\ref{fig:RealAnyTestIE}. Note that there are incorrectly-segmented
hands in this sequence. We decided to leave these in to avoid
frame rearrangements (losing the uniform frame sampling), to show
that segmentation does not always work correctly, and to show that
this approach is inherently robust to extreme segmentation errors.
In this experiment, there was usually visual agreement between
the observed hand and the estimate, as seen in the figure. Note that even
for a human observer, looking at the segmented silhouettes in the
figure, reconstruction is sometimes ambiguous. There are also some
configurations for which the system did not perform correctly.

Fig.\ \ref{fig:RealAnyTestIIE} shows the estimates obtained via
the MS approach. The frames shown were taken approximately every
0.9 seconds. In the second row, we can see some limitations of the
Hu moment feature space: sometimes, different hand orientations
are very similar in the feature space. These apparently different
hypotheses may actually be close to each other in terms of their
probability, given the features. The same effect repeats clearly
in the third and sixth rows. This problem might be alleviated by
using a different input feature space. At the extreme, one might
consider the full silhouette as a feature. Of course, there are
important trade-offs to take into account when considering
different features; e.g., invariance and dimensionality.

\psfigurepath{./figs/RealResultsH2Unr}
\begin{figure*}[ht]
\centerline{\small
\begin{tabular}{rcccccccccc}
RV &
\psfig{figure=Subsampled1_F771_1269.rle.00001.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00006.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00011.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00016.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00021.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00026.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00031.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00036.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00041.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00046.eps,width=0.63in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00001.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00006.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00011.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00016.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00021.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00026.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00031.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00036.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00041.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00046.eps,width=0.63in,clip=t} \\
%
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F771_1269.rle.00051.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00056.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00061.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00066.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00071.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00076.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00081.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00086.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00091.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00096.eps,width=0.63in,clip=t} \\
%
MO &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00051.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00056.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00061.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00066.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00071.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00076.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00081.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00086.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00091.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00096.eps,width=0.63in,clip=t} \\
%
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F771_1269.rle.00101.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00106.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00111.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00116.eps,width=0.63in,clip=t} &
\psfig{figure=Subsampled1_F771_1269.rle.00121.eps,width=0.63in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00126.eps,width=0.63in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00131.eps,width=0.63in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00136.eps,width=0.63in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00141.eps,width=0.63in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00146.eps,width=0.63in,clip=t}\\
%
MO &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00101.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00106.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00111.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00116.eps,width=0.63in,clip=t} &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00121.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00126.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00131.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00136.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00141.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00146.eps,width=0.63in,clip=t}\\
%
\\
\hline
\\
%
RV &
\psfig{figure=Subsampled1_F771_1269.rle.00151.eps,width=0.63in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00156.eps,width=0.63in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00161.eps,width=0.63in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00166.eps,width=0.63in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00171.eps,width=0.63in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00176.eps,width=0.63in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00181.eps,width=0.63in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00186.eps,width=0.63in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00191.eps,width=0.63in,clip=t}&
\psfig{figure=Subsampled1_F771_1269.rle.00196.eps,width=0.63in,clip=t}\\
%
MO &
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00151.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00156.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00161.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00166.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00171.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00176.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00181.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00186.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00191.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00196.eps,width=0.63in,clip=t}\\
\end{tabular}}
 \mycaption{Estimated hand poses from real sequences using the Mean
Output (MO) algorithm and $\zeta$}{\small \CapRTestIE}{}
\label{fig:RealAnyTestIE}
\end{figure*}


\psfigurepath{./figs/RealResultsH2UnrM}
\begin{figure*}[ht]
\centerline{
\begin{tabular}{ccccccc}
RV & MO & S1 & S2 & S3 & S4 & S12 \\
%
\psfig{figure=Subsampled1_F771_1269.rle.00151.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00151.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00051_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00051_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00051_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00051_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00051_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00160.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00160.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00054_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00054_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00054_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00054_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00054_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00169.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00169.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00057_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00057_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00057_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00057_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00057_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00178.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00178.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00060_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00060_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00060_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00060_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00060_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00187.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00187.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00063_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00063_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00063_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00063_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00063_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00196.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00196.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00066_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00066_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00066_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00066_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00066_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00214.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00214.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00072_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00072_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00072_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00072_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00072_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00223.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00223.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00075_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00075_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00075_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00075_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00075_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00001_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00001_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00001_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00001_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00001_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00064.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00064.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00022_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00022_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00022_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00022_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00022_012.eps,width=0.63in,clip=t}\\
%
\psfig{figure=Subsampled1_F771_1269.rle.00091.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7GR7iE_Fts_Subsampled1_F771_1269.mat.00091.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_001.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_002.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_003.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_004.eps,width=0.63in,clip=t}&
\psfig{figure=Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_012.eps,width=0.63in,clip=t}\\
%              Res_softH2V31-7_7R7iEM_Fts_Subsampled1_F771_1269.mat.00031_012
\end{tabular}} \mycaption{Estimated hand poses from real video (RV) sequences using
Mean Output (MO) and Multiple Samples (MS) inference.}{\small \CapRTestIIE}{}
\label{fig:RealAnyTestIIE}
\end{figure*}

\subsection{2D Human Body Pose Reconstruction}\label{sec:2DHum}

Our approach is next tested on the task of estimating human body pose.
The goal is to estimate the 2D locations of body markers in the
image, given visual features computed from the person's
silhouette. In this experiment, we use the {\it Body-All-Views}
dataset, which contains a total of over 100,000 samples.  Of
these, 8,000 were used for training and the rest for testing. The
input-output pairs were defined as follows.  The input consisted
of the 10 Alt moments computed from the silhouette. The output
consisted of 20 2D marker positions (40 DOF), which were then
linearly encoded by nine values using PCA.

The number of \changed{mixture components in the discriminative model}
(number of specialized functions) was set to 15. This number was
determined via the MDL criterion, as before. Each specialized function
is a one hidden layer, feed-forward network with seven hidden nodes.

\subsubsection{Quantitative Results}

Fig.\ \ref{fig:ArtC} shows the reconstruction obtained with the MO
approach for frames taken from three synthetic sequences
excluded from the training set.
The agreement between reconstruction and observation is easy to
perceive for all frames. Also, for self-occluding configurations,
the estimate is still similar to ground-truth.
%It is important to
%remark that no human intervention nor pose initialization was
%required.
%RRChange, It is important to remark that
%SS: Removed. This is redundant.  I can't point to at least
% two other places
% in the paper where you say this already.

Fig.\ \ref{fig:ArtCP} shows the average marker error and variance
per body orientation in percentage of body height. Note that the
error is bigger for orientations closer to $0$ and $\pi$ radians.
This intuitively agrees with the notion that at those angles
(side-views), there is less visibility of the body parts.  We
consider this performance promising, given the complexity of the
task and the simplicity of the approach. For poses chosen at
random from those excluded from the training set, the RMSE was
10.35\% of body height (with 20\% variance). In related work,
quantitative performance has usually been ignored, in part due to
the lack of ground-truth and standard evaluation datasets.
%SChanged above paragraph

%% \begin{figure}[h]
%% \parbox[c]{0.635\textwidth}{
%% \psfigurepath{../NIPS01/epsArt}
%% \centerline{GT
%% \psfig{figure=ArtSil_00000.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00001.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00002.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00019.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00023.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00038.Art40.eps,width=0.65in,clip=t}
%% } \centerline{MO
%% \psfig{figure=000000-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000001-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000002-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000019-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000023-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000038-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% } \vspace{-.2in}\rule[.0in]{4.0in}{0.01in}
%% \psfigurepath{../NIPS01/epsArt2}
%% \centerline{GT
%% \psfig{figure=ArtSil_00004.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00005.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00006.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00007.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00009.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00013.Art40.eps,width=0.65in,clip=t}
%% } \centerline{MO
%% \psfig{figure=000004-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000005-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000006-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000007-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000009-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000013-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% } \vspace{-.4in}\rule[.0in]{4.0in}{0.01in} \centerline{GT
%% \psfig{figure=ArtSil_00035.Art40.eps,width=0.65in,clip=t}
%% \psfig{figure=ArtSil_00036.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00038.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00041.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00045.Art40.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=ArtSil_00049.Art40.eps,width=0.65in,clip=t}
%% } \centerline{MO
%% \psfig{figure=000035-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000036-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000038-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000041-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000045-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% \hspace*{-0.5em}\psfig{figure=000049-431602080.Art40.tif.eps,width=0.65in,clip=t}
%% } } \hfill
%% \parbox[c]{0.42\textwidth}{
%% \psfigurepath{../NIPS01/eps}
%% \centerline{\psfig{figure=ViewPointTest.eps,width=0.4\textwidth,clip=t}}}
%% \mycaptionS{\small Left: Example reconstruction of several test
%% sequences with CG-generated silhouettes. Each set consists of
%% input images and reconstruction (every 5th frame). Right: Marker
%% root-mean-square-error and variance per camera viewpoint (every
%% $2\pi/32$ rads.). Units are percentage of body height. Approx.
%% 110,000 test poses were used. } \label{fig:ArtC}
%% \end{figure}


\psfigurepath{../NIPS01/epsArt/}
\begin{figure}[t]
\centerline{\small
\begin{tabular}{rccccccccc}
GT &
\psfig{figure=ArtSil_00000.Art40.eps,width=0.65in,clip=t} &
\psfig{figure=ArtSil_00001.Art40.eps,width=0.65in,clip=t} &
\psfig{figure=ArtSil_00002.Art40.eps,width=0.65in,clip=t} &
\psfig{figure=ArtSil_00019.Art40.eps,width=0.65in,clip=t} &
\psfig{figure=ArtSil_00023.Art40.eps,width=0.65in,clip=t} &
\psfig{figure=ArtSil_00038.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=ArtSil_00004.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=ArtSil_00005.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=ArtSil_00006.Art40.eps,width=0.65in,clip=t} \\
MO &
\psfig{figure=000000-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000001-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000002-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000019-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000023-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000038-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=000004-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=000005-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}
\psfig{figure=000006-431602080.Art40.tif.eps,width=0.65in,clip=t} \\
\\
\hline
\\
GT &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00007.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00009.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00013.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00035.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00036.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00038.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00041.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00045.Art40.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=ArtSil_00049.Art40.eps,width=0.65in,clip=t} \\
MO &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000007-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000009-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000013-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000035-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000036-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000038-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000041-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000045-431602080.Art40.tif.eps,width=0.65in,clip=t} &
\psfigurepath{../NIPS01/epsArt2/}\psfig{figure=000049-431602080.Art40.tif.eps,width=0.65in,clip=t} \\
\end{tabular}
}\hfill \mycaptionS{\small Example reconstruction of frames from 
test sequences with computer graphics-generated silhouettes.
%Each %several
%set consists of input images and reconstruction.
% (every 5th frame).
} \label{fig:ArtC}
\end{figure}


\psfigurepath{../NIPS01/eps}
\begin{figure}[t]
\centerline{\psfig{figure=ViewPointTest.eps,width=0.45\textwidth,clip=t}}
\mycaptionS{\small Marker root-mean-square-error and variance per
camera viewpoint (every $2\pi/32$ rads.). Units are percentage of body
height. Approx.  110,000 test poses were used.}
\label{fig:ArtCP}
\end{figure}


\subsubsection{Experiments with Real Images}

We now test the approach using real video sequences of human body
motion. We use the basic segmentation approach described in Sec.\
\ref{sec:BodyDet} to obtain silhouettes.

Fig.\ \ref{fig:ExampR0} shows examples of system performance
obtained via the MO approach for several relatively complex motion
sequences. Even though the characteristics of the segmented body
differ from the ones used for training, good performance is still
achieved. Most reconstructions are visually close to what can be
thought of as the right pose reconstruction. Body orientation is
also generally accurate.

%We used 60 specialized functions, each one was a MLP with five
%hidden nodes.

Fig.\ \ref{fig:RealBodyMS} shows the top-ranked pose samples obtained
via the MS approach.  Note that despite low-quality segmentation, the
system outputs reasonably accurate pose hypotheses. Orientation is
accurate and the relative limb relationships are maintained. However,
we can observe that some poses are inherently difficult and the
estimate lacks enough pose detail to be perceived as a good
estimate. For example, the eighth row shows a side view of a person
raising one arm while keeping the other arm at rest. The resulting MS
estimates all show a side-view, however none has the correct arm
configuration. \changed{ This could be due to a lack of relevant
training data; as a consequence, the discriminative model $q$ may not
approximate the generative model $p$ very well around the input
vector}. Another important factor is the visual difference between
the rendered model and the real body observed.

%One difference with respect to the hand pose estimation task is
%that the rendering quality or realism for body pose is poorer for
%the human body renderer.
In this work, we did not pursue use of a more realistic human body
renderer.  This could affect the performance with real data since,
as in most learning methods, it is critical that the training data
be a good approximation to the data the algorithm will be tested
with. Due to differences in shape and width of body components
observed in training versus testing, the visual features may
differ. Improving the match between visual features used in
training and testing is an area that we plan to investigate in
future research. In theory this could allow us to adapt our
algorithm to different body or hand anthropometric
characteristics.

\psfigurepath{../NIPS01/eps}
\begin{figure}[h]
\centerline{\small \begin{tabular}{rcccccccccc}
RV &
\psfig{figure=Sil_00001.1.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00002.1.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00003.1.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00004.1.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00005.1.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00000.2.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00001.2.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00002.2.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00000.3.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00001.3.eps,width=0.65in,clip=t}\\
MO &
\psfig{figure=000001-431602080.1.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000002-431602080.1.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000003-431602080.1.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000004-431602080.1.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000005-431602080.1.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000000-431602080.2.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000001-431602080.2.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000002-431602080.2.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000000-431602080.3.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000001-431602080.3.tif.eps,width=0.65in,clip=t}\\
\\
\hline
\\
RV &
\psfig{figure=Sil_00001.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00002.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00003.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00004.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00005.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00006.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00007.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00008.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00009.4.eps,width=0.65in,clip=t} &
\psfig{figure=Sil_00010.4.eps,width=0.65in,clip=t} \\
MO &
\psfig{figure=000001-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000002-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000003-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000004-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000005-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000006-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000007-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000008-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000009-431602080.4.tif.eps,width=0.65in,clip=t} &
\psfig{figure=000010-431602080.4.tif.eps,width=0.65in,clip=t}\\
\end{tabular}
} \caption{\small Reconstruction obtained from observing a human
subject (every 10th frame).} \label{fig:ExampR0}
\end{figure}


\psfigurepath{./figs/ResRealBodyMS}
\begin{figure*}[ht]
\centerline{
\begin{tabular}{cccccc}
RV & S1 & S2 & S3 & S4 & S12 \\
%
\psfig{figure=Sil_00001.1.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S001.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S006.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S007.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S008.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S009.1.tif.eps,width=0.65in,clip=t} \\
%
\psfig{figure=Sil_00003.1.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S001.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S002.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S005.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S006.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S008.1.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00005.1.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S001.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S003.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S005.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S006.1.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S011.1.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00001.2.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S007.2.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S008.2.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S009.2.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S010.2.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S011.2.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00001.3.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S003.3.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S004.3.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S005.3.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S006.3.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S007.3.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00001.4.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S001.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S002.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S003.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S006.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S007.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00003.4.eps,width=0.65in,clip=t}&
\psfig{figure=00002_S002.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00002_S005.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S007.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S008.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00003_S010.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00005.4.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S003.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S004.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S005.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S006.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00005_S007.4.tif.eps,width=0.65in,clip=t} \\
%
\psfig{figure=Sil_00007.4.eps,width=0.65in,clip=t}&
\psfig{figure=00007_S001.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00007_S002.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00007_S003.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00007_S004.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00007_S005.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00009.4.eps,width=0.65in,clip=t}&
\psfig{figure=00009_S001.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00009_S002.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00009_S003.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00009_S006.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00009_S010.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00010.4.eps,width=0.65in,clip=t}&
\psfig{figure=00010_S001.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00010_S003.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00010_S004.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00010_S005.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00010_S006.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00012.4.eps,width=0.65in,clip=t}&
\psfig{figure=00012_S001.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00012_S002.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00012_S007.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00012_S009.4.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00012_S012.4.tif.eps,width=0.65in,clip=t}\\
%
\psfig{figure=Sil_00001.5.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S004.5.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S008.5.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S009.5.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S011.5.tif.eps,width=0.65in,clip=t}&
\psfig{figure=00001_S012.5.tif.eps,width=0.65in,clip=t}\\
\end{tabular}}
\mycaptionS{Estimated body poses from real sequences obtained via
MS inference.}\label{fig:RealBodyMS}
\end{figure*}


\section{Conclusions}

\label{sec:Dis}

\changed{
In this paper, we have described a novel method that allows us to
combine generative and discriminative models for solving complex
probabilistic inference problems, in particular inferring 3D and 2D
articulated body pose from a series of observed visual features in a
single image. This approach is most useful when the generative model
is accurate (\ie we have access to an inverse mapping function) but it is
very difficult to perform probabilistic inference using this model
alone.}

\changed{
In order to solve the inference problem (and also perform MAP
estimation), we have shown that a mathematically sound approach is to
use a discriminative model and learn its parameters using relevant
training data. The probability distribution implied by the
discriminative model can be used as a proposal distribution to
generate samples and find a posterior probability distribution
(perform approximate inference) under the (accurate but complex)
generative model.}


%q=p, then done???? OIK

%% In this paper, we have described a novel method that allow us to
%% combine generative and discriminative models for proabbilistic
%% inference. The SMA employs a set of several mapping functions that are
%% learned from training data. Each specialized function maps certain
%% domains of the input space onto the output space. The SMA learning
%% formulation uses ideas from Maximum Likelihood estimation and latent
%% variable models. A variant of the Expectation-Maximization algorithm
%% is used for simultaneous learning of the specialized domains along
%% with the mapping functions. One key advantage of the SMA is that it
%% can model ambiguous, one-to-many mappings that may yield multiple
%% valid output hypotheses.
%% %Once learned, the mapping
%% %functions generate a set of output hypotheses for a given input
%% %via a statistical inference procedure.

%% Another key advantage of the SMA formulation is its incorporation
%% of a feedback or inverse function, $\zeta$ in statistical
%% inference. 
%% %if
%% %desired.}
%% %To the best
%% %of our knowledge, we do not know of any other probabilistic
%% %formulation undertaking these ideas.
\changed{
When comparing it to other relevant methods, we can find alternative
(dual) interpretations of this framework. The use of a generative
model (through $\zeta$) affords an alternative to the gating networks
of the Mixture of Experts paradigm (seen in place of our discriminative
model)~\cite{Jordan94}} in that it allows for simpler discriminative
models (also see \cite{Hinton98,Friedman91} for other models). The
discriminative model in our approach assumes that the mixing factors
are independent of the input, as seen in Sec.\ \ref{sec:ProMod}. At
first sight, this seems to limit the architecture's
expressiveness. However, the combination of discriminative (also
referred to here as `forward') and generative models eliminates this
independence assumption. In other words, the generative model $\zeta$
provides an alternative that avoids increasing the discriminative
model complexity without restricting model expressiveness. 

%%  Note that
%% in our formulation formulation, different sets of appropriate
%% conditional independence assumptions are specified by the forward and
%% inverse models. 
%% In
%% applications such as those presented in this paper, $\zeta$ can be a
%% computer graphics rendering function or an approximation $\hat{\zeta}$
%% can itself be learned from training data.  Thus, the SMA exploits
%% available prior information about the structure of the problem.


%%RRChange This allowed us... [very important]
%%SS: OK.  I reworded slightly to make it clearer/shorter.
%%RR: If you remove 'if desired' it would be OK, since SMA needs them both
%%SS: Hope it's OK with you.
%%RR:
%% I would prefer this, hope it is clear what's the point from my email
%% 'Another key advantage of the SMA formulation is its incorporation of a
%% feedback or inverse function, $\zeta$ in statistical inference. This
%% allowed us to derive an inference method was based on the possibility
%% of alternatively use different sets of conditional independence
%% assumptions specified by the forward and inverse models'
%%RRChange To the best of our knowledge, we do not know of any other probabilistic formulation undertaking these ideas. [I think we should emphasize the novelty here]
%%SS: I removed this.  added word ``novel'' in prior sentence.
%%RR: Do you think it is too risky to say that? or why did you remove it?
%%RRChange [deleted] ....learned from training data \footnote{It is important to add that the use of $\zeta$ does not limit the possibility of having multi-modal posteriors over $\mb{x}$.}
%%SS: OK

Our approach was demonstrated in a computer vision system that can
estimate the articulated pose parameters of a human body or human
hands, given features computed from an image silhouette.  Articulated
pose reconstruction from a single image is a particularly difficult
problem because this mapping is highly ambiguous and complex.  We have
obtained promising results even using a very simple set of image
features, such as moment invariants of the hand or body's image
silhouette.  Choosing the best subset of image features for this
application is by itself a complex problem, and a topic of ongoing
research.

This approach offers several advantages over many previous methods for
articulated pose estimation.  Many previous approaches have tried in
numerous ways to use camera geometry and/or model registration to
perform pose estimation, resulting in iterative procedures that
require careful choice of initial conditions (model placement). We
have shown how in some cases these alternative approaches could be
seen as inferring a posterior distribution using the generative model
only. In this approach no iterative minimization methods are used in
pose inference. Moreover, inference is fully automatic -- no manual
initialization of the articulated model is required.  Another set of
previous approaches attempt to learn articulated model dynamics
\cite{Brand99,Howe99,Perona00}; however, learning dynamics requires
substantially more training data, and tends to produce systems that
are biased towards specific motions. Our framework avoids this and
learns/estimates pose from a single image only.

It is also important to note that this approach is general. Thus,
applications need not be limited to the vision domain. As a simple
example, one could apply this approach in speech recognition problems,
where the input space is given by features computed on acoustic
signals (\eg cepstral coefficients), and the output space could be the
space of phonemes. In this case, the generative model (feedback
function) would involve an acoustical rendering of phonemes.

Several interesting problems remain for future work.  Within the
context of articulated pose estimation, one topic for future
investigation is how to adapt the system to a specific body
morphology, one of the major issues affecting performance. Integration
of pose estimation with image segmentation for a fully-integrated
detection and pose reconstruction formulation would also be highly
desirable, and may enable greater robustness to occlusion and
noise. More generally, methods for incorporating knowledge of dynamics
in our framework should be investigated, as discussed in
\cite{RosalesPhDThesis}. Another general problem is how to learn what
the best (\eg visual) features are for specific problems or
datasets. While promising advances have been made in boosting of
features \cite{Freund95}, extension of our framework to incorporate
such concepts remains a topic for future investigation.
%%% Adaptive extra learning in the q model

\section*{Acknowledgments} The hand sequences used in our
experiments were collected in collaboration with Vassilis Athitsos.
We thank Tommi Jaakkola, Quaid Morris, and Matt Brand for suggestions
and interesting discussions. This research was supported in part by
the U.S.\ Office of Naval Research under grants N000140310108 and
N000140110444, and the U.S.\ National Science Foundation under grants
IIS-0208876 and IIS-9809340.

\renewcommand{\baselinestretch}{1}
\bibliography{thesis}

\end{document}
