Commit 4e15469d997bf5ab85d62eabc5a73f051d65115e

Final version
biblio.bib
(19 / 0)
  
420420 publisher = "{ACM} Press",
421421 year = 2006,
422422}
423
424@article{Kirkpatrick1983,
425author = {Kirkpatrick, S. and Gelatt, C. D. and Vecchi, M. P.},
426title = {Optimization by Simulated Annealing},
427volume = {220},
428number = {4598},
429pages = {671--680},
430year = {1983},
431journal = {Science}
432}
433
434@article{bib:Iacca2012,
435 author = {G. Iacca and F. Neri and E. Mininno and Y. S. Ong and M. H. Lim},
436 title = {Ockham's Razor in Memetic Computing: Three Stage Optimal Memetic Exploration},
437 journal = {Information Sciences},
438 year = {2012},
439 Volume = {188},
440 Pages = {17--43},
441}
opt_comparison.tex
(513 / 155)
  
11\documentclass{svjour3}
22\usepackage[latin1]{inputenc}
33\usepackage[T1]{fontenc}
4%\usepackage{lmodern}
54\usepackage{mathptmx}
65\usepackage{amssymb}
76\usepackage{subfigure}
87\usepackage{graphicx}
8\usepackage{rotating}
99
10\journalname{Soft Computing}
1011\begin{document}
1112
12\title{Comparing Optimizers:\\ A Dispute against the Diktat of the Mean}%{{{
13\author{Matthieu Weber \and Ilpo Poikolainen \and Sami Äyrämö \and Ville Tirronen}
13\title{Perspectives of algorithmic performance}
14\subtitle{A discussion on the relevance of the sample mean for comparing stochastic algorithms}%{{{
15\author{Ilpo Poikolainen \and Matthieu Weber \and Sami Äyrämö \and Ville Tirronen}
1416\institute{%
15Matthieu Weber \at
16\email{matthieu.weber@jyu.fi}\\
17\and
1817Ilpo Poikolainen \at
19\email{ilpo.poikolainen@jyu.fi}\\
18\email{ilpo.poikolainen@jyu.fi}
2019\and
20Matthieu Weber \at
21\email{matthieu.weber@jyu.fi}
22\and
2123Sami Äyrämö\at
2224\email{sami.ayramo@jyu.fi}
2325\and
2426Ville Tirronen\at
25\email{ville.tirronen@jyu.fi}\\
27\email{ville.tirronen@jyu.fi}
2628\and
2729\bigbreak
2830University of Jyväskylä\\
3737\maketitle
3838
3939\begin{abstract}
40Some text
40The comparison of stochastic algorithms, such as computational intelligence or
41data mining algorithms, is generally performed by comparing samples of results
42produced by repeatedly running these algorithms. The sample mean is often used
43as a measure of algorithmic performance, under the assumption that
44the results produced by these algorithms follow a normal probability
45distribution. Using both formal and experimental counterexamples, this article
46shows the assumption to be wrong in general and investigates the qualitative
47interpretation of usual statistics such as the sample mean, the standard
48deviation and the quartiles in the context of comparing the performance of
49such algorithms. After showing that the sample mean cannot be satisfactorily
50interpreted as a measure of algorithmic performance, four perspectives are
51suggested for that purpose: accuracy, reliability, robustness and efficiency,
52for which formal definitions and practical interpretations are given.
53Performing multiple-function comparisons of multiple algorithms requires
54ranking the algorithms, a task generally performed by comparing the sample means of
55the data produced by the algorithms. This article then investigates, using two
56case studies, the use of accuracy, reliability and robustness as alternatives
57to the sample mean, leading to results that better correspond to the
58qualitative interpretation that can be made from the raw data produced by
59the algorithms.
60
4161\keywords{Computational Intelligence \and Statistical Analysis \and
42Statistical Model \and Data Mining}
62Statistical Model \and First Order Statistic \and Pairwise Comparison \and
63Multiple Comparison \and Data Mining}
4364\end{abstract}%}}}
4465
4566\section{Introduction}\label{sec:intro}%{{{1
8181function of the size of the sample; repeated measurements therefore lead to a
8282better precision of the quantity that is measured.
8383
84The behavior of CI and DM algorithms, yielding slighlty different results each
84The behavior of CI and DM algorithms, yielding slightly different results each
8585time they are run, seems to be similar to that of the measurement of a physical
8686quantity, and common practice heavily leans toward using the sample mean as a
8787measure of the performance of an algorithm. The following examples have been
106106test \cite{bib:Wilcoxon1945} is applied to pairwise comparisons, over all the test functions
107107considered in the study, between the best of the four variants and the
108108reference algorithms. In \cite{Qian2012} a self-adaptive differential
109evolution with $\alpha$-contrained-domination is applied to multi-objective
109evolution with $\alpha$-constrained-domination is applied to multi-objective
110110problems; the new algorithm is compared against two state-of-the-art
111111algorithms and minimum, maximum, median, mean and standard deviation values
112112are presented, as well as the hypervolume indicator, the unary $\epsilon$
126126as sets of mean fitness values and standard deviations. Additionally, the
127127t-test is used for comparing the two algorithms. In \cite{Sun2012} a
128128cooperative PSO is presented and compared to reference, cooperative
129algorithms; the performance of the proposed algorithm is prensented as a set
129algorithms; the performance of the proposed algorithm is presented as a set
130130of mean fitness values and their variance. In \cite{Sundar2012}, an ABC
131131algorithm hybridized with a local search algorithm is applied to the
132132early/tardy scheduling problem; the average deviation from the known optimum
174174experimentally study the variations of these different estimators on a
175175several algorithms applied to the CEC'08 benchmark.
176176
177\section{Interpretations of the mean, the standard deviatiom and the quantiles}%{{{1
177\section{Interpretations of the mean, the standard deviation and the quantiles}%{{{1
178178
179179The arithmetic mean is mathematically defined for most common continuous
180180probability distribution as the expected value of a random variable $X$ with
190190the mean indicates by how much the center of a given normal distribution is
191191shifted away from 0. This interpretation is useful when dealing with the
192192measurement of physical quantities, where the measurement equipment always
193introduces a small error. In such cases, repeated measurements yield slightly
194different values; in many cases, thes values are normally distributed.
195Estimating the mean of that distribution from a sample using the sample mean
196allows to estimate the ``true'' value of the physical quantities
197that one wants to measure. The variance of the sample mean (and therefore the
198variance of the measured value of the quantity with respect to its ``true''
199value) is (see e.g., \cite{upton1996})\begin{equation}\hat\sigma^2 =
200\sigma^2/N\end{equation} where $\hat\sigma^2$ is the variance of the sample
201mean, $\sigma^2$ is the variance of the distribution from which the sample is
202taken and $N$ is the size of the sample. One may assume that this
203procedure, which is perfectly valid in the case of measuring a physical
204quantity with an equipment that introduced noise in the measurement has
205prompted the now widespread use of the sample mean as a method for estimating
206the performance of a stochastic algorithm. The sample produced by repeatedly
207running a stochastic algorithm on a given function cannot however be
208interpreted as one ``true'' value that is perturbed by noise, and the method
209described above must therefore not be blindly applied when comparing
210algorithms.
193introduces a small, random error. In such cases, repeated measurements yield
194each time a slightly different value; in many cases, these values are
195considered to be normally distributed. Estimating the mean of that
196distribution from a sample using the sample mean allows one to estimate the
197``true'' value of the physical quantities that one wants to measure. The
198variance of the sample mean (and therefore the variance of the measured value
199of the quantity with respect to its ``true'' value) is (see e.g.,
200\cite{upton1996})\begin{equation}\hat\sigma^2 = \sigma^2/N\end{equation} where
201$\hat\sigma^2$ is the variance of the sample mean, $\sigma^2$ is the variance
202of the distribution from which the sample is taken and $N$ is the size of the
203sample. One may conjecture that this procedure, which is perfectly valid and
204widely accepted in the case of measuring a physical quantity with equipment
205that introduces noise in the measurement, has prompted the now widespread use
206of the sample mean as a method for estimating the performance of a stochastic
207algorithm. The sample produced by repeatedly running a stochastic algorithm on
208a given function cannot however be interpreted as one ``true'' value that is
209perturbed by noise, and the method described above must therefore not be
210blindly applied when comparing algorithms.
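As a small numerical sketch (not part of the original experiments; the values of $\sigma$, $N$ and the ``true'' value below are placeholders), the following Python snippet checks the relation $\hat\sigma^2 = \sigma^2/N$ for normally distributed measurement noise:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
sigma, N, repeats = 2.0, 25, 10_000

# Each row is one sample of N noisy measurements of a "true" value of 5.0.
samples = 5.0 + sigma * rng.standard_normal((repeats, N))
sample_means = samples.mean(axis=1)

print(sample_means.var())  # empirical variance of the sample mean
print(sigma**2 / N)        # theoretical value sigma^2 / N = 0.16
\end{verbatim}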
211211
212212The standard deviation, defined as the square-root of the variance, is a
213213statistic that is often associated with the mean, and interpreted as a measure
222222its probability density function being $f(x) = \lambda e^{-\lambda x},\,x>0$
223223its standard deviation is always equal to its mean i.e., $\mu = \sigma =
2242241/\lambda$. The rule of thumb presented above is no longer applicable since
225$\mu - \sigma = 0$, which is the minumum of the distribution's support
225$\mu - \sigma = 0$, which is the minimum of the distribution's support
226226interval: there cannot be any data below that point. Using the rule of thumb
227227without knowing from what probability distribution the data comes is therefore
228228misleading.
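A quick numerical check of this remark (illustrative only; the rate parameter is arbitrary): for exponentially distributed data the sample mean and sample standard deviation nearly coincide, so essentially no data lies below $\mu - \sigma$ and the rule of thumb is uninformative.
\begin{verbatim}
import numpy as np

x = np.random.default_rng(1).exponential(scale=2.0, size=100_000)

print(x.mean(), x.std())                 # both close to 2.0
print((x < x.mean() - x.std()).mean())   # almost no data below mean - std
\end{verbatim}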
229229
230230Another property of symmetrical distributions is that the value of the mean
231231coincides with the mode i.e., the value with the highest frequency. When
232considering a stochastc process following a given distribution, the mode can
232considering a stochastic process following a given distribution, the mode can
233233be interpreted as the most probable outcome. One may thus consider that values
234234``close'' to the mode are common, typical values, as opposed to extreme values
235235(from the tails of the distribution) that happen rarely. In the case of
236236processes where a symmetrical distribution is assumed, the neighborhood of the
237mean therefore estimates the most probable outcomes. This is howerver not
238necessary the case anymore for asymetrical distributions, where the mode and the mean
239are distinct, and if the distribution is skewed enought, the assumption that
237mean therefore estimates the most probable outcomes. This is however not
238necessarily the case anymore for asymmetrical distributions, where the mode and the mean
239are distinct, and if the distribution is skewed enough, the assumption that
240240the neighborhood of the mean contains the most probable outcome does not hold
241241anymore.
242242
243243The mean can also be interpreted as a ``best guess'' of what would be the
244value of a single outcome. It incorportates information about the spread of the
244value of a single outcome. It incorporates information about the spread of the
245245probability distribution, which makes the sample mean sensitive to outliers.
246246More exactly, if a sample contains a large value (which may happen with a low,
247247but non-null probability), the sample mean will be shifted towards this value,
273273has a probability $\geq 0.5$ to produce a solution at least as good as $m_B$.
274274Reciprocally, algorithm $B$ has a probability $\leq 0.5$ to produce a solution
275275at least as good as $m_A$. The value of $m_B$ (resp. $m_A$) in this example is
276intepreted as a threshold below which a solution can be considered ``good'';
276interpreted as a threshold below which a solution can be considered ``good'';
277277the algorithm with the lowest median has a higher or equal probability of being able to
278278produce a solution at least as good as this threshold. This interpretation can
279279be generalized to any $q$-quantile:\begin{eqnarray}Pr[X_A \leq x_B] & = & Pr[X_A
287287x_B]\end{equation} meaning that $A$ has a higher probability than $B$ to find a
288288solution at least as good as $x_B$.
289289
290On the contrary to the mean, quantiles (and particluarly the median) are less
290Unlike the mean, quantiles (and particularly the median) are less
291291sensitive to outliers. More precisely, the presence of an outlier, however
292292far from the median, does not excessively modify its value. In cases where the
293293location of the outliers does not matter, the median is therefore a more
301301
302302Let us consider the problem of minimizing a continuous function of $n$
303303real-valued variables $f: D \rightarrow \mathbb{R}$, where $D \subset
304\mathbb{R}^n$, $D = [a_1, b_1]\times \ldots [a_n, b_n]$. Function $f$ is often refered to as a \emph{fitness
304\mathbb{R}^n$, $D = [a_1, b_1]\times \ldots [a_n, b_n]$. Function $f$ is often referred to as a \emph{fitness
305305function}. Let us then assume that a meta-heuristic optimization
306306algorithm~$A$ (e.g., Simulated Annealing, Evolution Strategy, Differential
307307Evolution, Particle Swarm Optimization\dots) is used for solving the
332332following.
333333
334334As stated before, the result produced by the process $(A,f)$ is the smallest
335fitness value in set $V_i$; this corresponds to the smallest value in sample
336$V$ and is usually called the first order statistic of that sample, noted
337$Y_{(1)}$. When process $(A,f)$ is repeated multiple times, the value of the
338first order statistic $Y_{(1)}$ varies, due to the stochasticity of the
339process. Knowing the probability density function $\phi(y)$ of $Y \in V$ and
340ensuring that the random variables $Y \in V$ are independent and identically
341distributed, it is possible to derive function $g(y)$, the probability density
342function of the first order statistic $Y_{(1)}$ as \cite{Arnold1992}
343\begin{equation} g(y) = N\phi(y)(1-\Phi(y))^{N-1}
344\label{eq:g} \end{equation} where $N$ is the size of sample $V$ i.e., the
345budget of the process $(A,f)$. The analytic form of $g$ is not trivial,
346especially when $f$ is a complicated function. However, one can have a
347qualitative approach towards describing the shape of $g$ for various values of
348$N$, by making a number of simplifications.
335fitness value in set $V$; this is usually called the first order statistic of
336that sample, noted $Y_{(1)}$. When process $(A,f)$ is repeated multiple times,
337the value of the first order statistic $Y_{(1)}$ varies, due to the
338stochasticity of the process. If the probability density function
339$\phi(y)$ of $Y \in V$ is known and if the random variables $Y \in V$ are
340independent and identically distributed, it is possible to derive function
341$g(y)$, the probability density function of the first order statistic
342$Y_{(1)}$ as \cite{Arnold1992} \begin{equation} g(y) =
343N\phi(y)(1-\Phi(y))^{N-1} \label{eq:g} \end{equation} where $N$ is the size of
344sample $V$ i.e., the budget of the process $(A,f)$. The analytic form of $g$
345is not trivial, especially when $f$ is a complicated function. However, one
346can have a qualitative approach towards describing the shape of $g$ for
347various values of $N$, by making a number of simplifications.
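A minimal Monte Carlo sketch (ours, not part of the benchmark experiments) makes the expression of $g(y)$ above concrete: for $Y$ exponentially distributed with rate 1, $g(y) = N\phi(y)(1-\Phi(y))^{N-1} = Ne^{-Ny}$, i.e., $Y_{(1)}$ is exponential with rate $N$ and mean $1/N$, which is easy to verify by simulation.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(2)
N, repeats = 20, 200_000

# "repeats" independent samples of size N from an exponential(1) distribution;
# the minimum of each row is one realization of the first order statistic.
minima = rng.exponential(1.0, size=(repeats, N)).min(axis=1)

print(minima.mean(), 1.0 / N)   # empirical vs. theoretical mean of Y_(1)
\end{verbatim}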
349348
350349\subsection{The simplified case}%{{{2
351350
352352function using a trivial optimization algorithm. Using a trivial function
353353allows to easily determine the analytic form of the probability density
354354function involved, and a trivial optimization algorithm can ensure that the
355sampling is independent and identically disributed.
355sampling is independent and identically distributed.
356356
357357Reusing the notations introduced above, let us consider the function $f:
358358[a,b]^n \rightarrow \mathbb{R}$ where $a<b$, such that for $x =
359359(x_1,\dots, x_n)$, $f(x) = \sum_{j=1}^n x_j$. A random-search optimization
360360algorithm, $A$, is applied to $f$ in order to find the function's minimum. The
361361random-search algorithm is trivial: one solution $x_k \in [a,b]^n$ is
362generated by sampling the interval $[a,b]$ $n$ times unsing a
363uniformely-distributed random number generator. A set of $N$ such solutions
362generated by sampling the interval $[a,b]$ $n$ times using a
363uniformly-distributed random number generator. A set of $N$ such solutions
364364is then created by repeating this process. By applying the fitness function
365365$f$ to each such solution $x_k$, a set of fitness values $V_i:\{y_1,\dots,
366366y_N\}$ is generated, where $y_k = f(x_k)$; the process $(A,f)$ being run
367multiple times, $i$ is in this contex the index of this particular run.
367multiple times, $i$ is in this context the index of this particular run.
368368The best fitness value found by the algorithm during this run, $(A,f)_i = \min
369369V_i$, is then returned.
370370
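The process just described is straightforward to express in code; the following sketch (with arbitrary bounds, dimension and budget, since none are prescribed here) returns the first order statistic $\min V_i$ of one run:
\begin{verbatim}
import numpy as np

def random_search_run(rng, a=-1.0, b=1.0, n=10, N=1000):
    X = rng.uniform(a, b, size=(N, n))   # N candidate solutions in [a, b]^n
    y = X.sum(axis=1)                    # fitness values y_k = f(x_k)
    return y.min()                       # best value found, i.e., min V_i

rng = np.random.default_rng(3)
results = [random_search_run(rng) for _ in range(100)]  # repeated runs (A, f)_i
print(min(results), np.median(results))
\end{verbatim}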
371In the context of generating $x = (x_1,\dots, x_n)$, each value $x_j \in x$
372can be considered as one possible value of a uniformely-distributed random
373variable $X_j \in X$ with $X$ a vector of random variables $X = (X_1,\dots,
371In the context of generating $x = (x_1,\dots, x_n)$, each value (parameter) $x_j$
372can be considered as a uniformly-distributed random number $X_j$
373forming a vector of random variables $X = (X_1,\dots,
374374X_n)$. The fitness value $y = f(x)$ can thus be considered as one possible
375375value of the random variable $Y = f(X) = \sum_{j=1}^n X_j$. The analytical
376376expressions of the probability density and cumulative density functions of $f$
466466presented in Figure~\ref{fig:first-order} are generally observed on the data.
467467For illustration purposes, the Differential Evolution \cite{bib:DEbook} and
468468CMA-ES \cite{bib:Hansen2003} have been applied to function $F1$ to $F6$ of the
469CEC2008 benchmark \cite{bib:cec2008}, in 20 dimensions. Each combination of
469CEC 2008 benchmark \cite{bib:cec2008}, in 20 dimensions. Each combination of
470470optimizer and function has been run 1000 times in order to have detailed
471471histograms.
472472
685685samples $S_A$ and $S_B$, and can be separated into two categories: the
686686comparison of a single statistic computed from the sample, such as the
687687sample mean, and the use of a statistical test to compare the two samples.
688Setting momentarily aside the simple comparison of statistics from the sample,
689we can make the following criticism about the use---and abuse---of statistical
690tests.
688Graphical methods based on modified histograms have been suggested for
689comparing two or more samples, such as in article \cite{Tirronen2010}, but
690these require a large amount of space for publishing and leave the
691interpretation to the reader, which is why they will not be considered in this
692study. Setting momentarily aside the simple comparison of statistics from the
693sample, we can make the following criticism about the use---and abuse---of
694statistical tests.
691695
692696The following tests all consider as null hypothesis that both samples $S_A$
693697and $S_B$ have been drawn from the same probability distribution. A rejection
725725other \cite{Fay2010}.
726726
727727\item The Kolmogorov-Smirnov two-sample test (see e.g., \cite{wilcox2012})
728measures the largest difference between the emprirical distributions of the
728measures the largest difference between the empirical distributions of the
729729samples in order to tell whether the samples are drawn from the same
730730distribution (this being the test's null hypothesis) or not. When the null
731731hypothesis is rejected by the test, one must still find a way to decide which
734734\end{itemize}
735735
736736Probability distributions are characterized by a number of statistics such as
737the mean, the variance, the mediam the skewness, the excess kurtosis\dots
737the mean, the variance, the median, the skewness, the excess kurtosis\dots
738738Regarding the comparison of samples based on a single statistic, one must
739739therefore ask oneself what characteristic of the algorithm is compared by this
740740process. The sample mean and media for example are measures of the central
757757result of algorithm $A$ is much worse than any of the results produced by
758758algorithm $B$.
759759
760\begin{table}
761\caption{Average, Median, Min and Max Fitness}\label{tab:avg_med_min_max}
760\begin{table}%{{{3
761\caption{Mean, median, minimum and maximum fitness on functions \emph{f8} and
762\emph{f9} of BBOB in 10 dimensions}\label{tab:avg_med_min_max}
762763\begin{tabular}{l|c|c|cc|c|c|cc}
763764\hline\hline
764765 & \multicolumn{4}{c|}{SADE} & \multicolumn{4}{c}{(1+1) CMA-ES}\\
765766\hline
766 & Average & Median & Minimum & Maximum & Average & Median & Minimum & Maximum\\
767 & Mean & Median & Minimum & Maximum & Mean & Median & Minimum & Maximum\\
767768\hline
768769f8 & $ 2.39e-02$ & $ 8.53e-14$ & $ 0.00e+00$ & $ 3.99e+00$ & $ 8.73e-01$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 3.99e+00$\\
769770f9 & $ 3.88e-02$ & $ 1.07e-07$ & $ 0.00e+00$ & $ 3.99e+00$ & $ 7.89e-01$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 3.99e+00$\\
770771\hline\hline
771772\end{tabular}
772\end{table}
773\end{table}%}}}3
773774
774\begin{table}
775\caption{Wilcoxon's Rank-Sum test on the Fitness (reference = SADE)}\label{tab:avg_wilcoxon}
775\begin{table}%{{{3
776\caption{Wilcoxon's Rank-Sum test on the Fitness on functions \emph{f8} and
777\emph{f9} of BBOB in 10 dimensions (the reference algorithm is SADE). The $-$
778(minus) sign indicates that (1+1) CMA-ES has stochastically smaller values
779than SADE.}\label{tab:avg_wilcoxon}
776780\begin{tabular}{l|c}
777781\hline\hline
778782 & (1+1) CMA-ES\\
779783\hline
780f8 & -\\
781f9 & -\\
784f8 & $-$\\
785f9 & $-$\\
782786\hline\hline
783787\end{tabular}
784\end{table}
788\end{table}%}}}3
785789
786790Tables~\ref{tab:avg_med_min_max}
787791and~\ref{tab:avg_wilcoxon} illustrate this situation with real experimental
788792results: the Self-Adaptive Differential Evolution \cite{bib:Qin2009} (SADE)
789793and the (1+1) CMA-ES \cite{bib:Igel2006} algorithms were applied to functions
790f8 and f9 of the Blackbox optimization benchmark 2010 \cite{bib:BBOB2010} in
794f8 and f9 of the Blackbox optimization benchmark (BBOB) 2010 \cite{bib:BBOB2010} in
79179510 dimensions with a budget of $50\,000$ function evaluations and for
792796$1\,000$ repetitions. Their minimum and maximum values are the same, but SADE
793797exhibits a lower sample mean. The Wilcoxon rank-sum test however indicates
804804\caption{Bimodal distribution of the results of (1+1) CMA-ES on BBOB function f8}\label{fig:bimodal}
805805\end{figure}
806806
807A closer examinaton of the samples produced by these algorithms on function f8
807A closer examination of the samples produced by these algorithms on function f8
808808indicates that their distributions are bimodal, one mode corresponding to the
809809function's global optimum, and the other one corresponding to a local optimum,
810810with no data between those, as illustrated in Figure~\ref{fig:bimodal}.
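The effect can be reproduced with a toy pair of bimodal samples (synthetic numbers, not the BBOB data): $a$ has the lower mean, $b$ has the lower median, and the rank-sum test (scipy's Mann--Whitney $U$ implementation, equivalent to the Wilcoxon rank-sum test) finds $b$ stochastically smaller.
\begin{verbatim}
import numpy as np
from scipy.stats import mannwhitneyu

a = np.array([8.5e-14] * 980 + [3.99] * 20)   # rarely in the bad mode, never exactly 0
b = np.array([0.0] * 800 + [3.99] * 200)      # often exactly 0, but bad mode more often

print(a.mean(), b.mean())                      # the mean favours a
print(mannwhitneyu(b, a, alternative="less"))  # but b is stochastically smaller
\end{verbatim}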
821821result. Depending on the type and number of these local optima, the
822822distribution of the algorithm's result may present more than one mode.
823823
824\subsection{Perspectives of algorithmic performance}%{{{2
824\subsection{Perspectives of algorithmic performance}\label{sec:perspectives}%{{{2
825825
826826In the light of the examples above, one must accept that a single statistic is
827827not enough for comparing two algorithms. This is due to the fact that
838838repeatedly running algorithm $A$, the sample minimum is the best possible
839839estimate of the problem's minimum and we therefore define
840840the accuracy as \begin{equation}\alpha_{(A,f)} = \min S_A\end{equation}
841In a context where one needs to estimate a problem's optimum, the algorithm
842with the lowest (i.e., best) accuracy value therefore has a better chance of producing a
843result close to that optimum than an algorithm with a higher accuracy value.
841844
842845Reliability indicates the algorithm's ability to repeatedly produce
843846``good'' results. We define the reliability $\Theta_{(A,f)}$ of algorithm $A$ as
844847the probability (estimated from the sample) for the algorithm to produce a
845848result that is below a given threshold $\theta_f$
846849\begin{equation}\Theta_{(A,f)} = \frac{|S'_A|}{|S_A|} : \forall s \in S'_A,\,
847s \leq \theta_f\end{equation} The reliability can alse be defined based on the
850s \leq \theta_f\end{equation} The reliability can also be defined based on the
848851empirical distribution function $\hat G(x)$ of $S_A$ (see e.g.,
849852\cite{barrio2007}), as \begin{equation}\Theta_{(A,f)} = \hat
850853G(\theta_f)\end{equation} The threshold $\theta_f$ marks the boundary between
863863value in a manner that depends on the algorithms being compared. We therefore
864864suggest to compute $\theta_f$ as the 10$^\mathrm{th}$ percentile of the union
865865of all the results produced by repeatedly running $T$ times all the algorithm
866being compared.
866being compared. In a context where the algorithm can be run repeatedly and the
867best solution is considered (e.g., an engineering problem), the algorithm with
868the better reliability will require fewer repeats to produce a ``good'' result
869and will therefore produce this result faster.
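A possible implementation of this measure (ours; the sample data below are placeholders) pools all results, takes their 10th percentile as $\theta_f$ and returns $|S'_A|/|S_A|$ for each algorithm:
\begin{verbatim}
import numpy as np

def reliability(samples_by_algorithm):
    pooled = np.concatenate(list(samples_by_algorithm.values()))
    theta = np.percentile(pooled, 10)              # threshold theta_f
    return {name: float(np.mean(s <= theta))       # |S'_A| / |S_A|
            for name, s in samples_by_algorithm.items()}, theta

rng = np.random.default_rng(4)
samples = {"A": rng.exponential(1.0, 100), "B": rng.exponential(2.0, 100)}
print(reliability(samples))
\end{verbatim}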
867870
868871Robustness is a measure of the algorithm's ability not to produce too
869872``bad'' results. Depending on the context in which a problem needs to be
886886function: the presence of an extreme value indicates that under some
887887circumstances, and with a probability that can be estimated by the frequency
888888of these extreme values, such results happen and must be taken into
889consideration.
889consideration. In a context where the algorithm can be run only once (e.g., an
890online optimization problem), reliability does not matter so much as the
891algorithm's ability not to produce ``too bad'' results. A better robustness
892therefore indicates an algorithm that is less prone to producing bad results;
893this may be especially important in situations where ``bad'' solutions must be
894avoided at all cost.
890895
891896Finally, efficiency indicates the complexity of the algorithm and the time
892897required for running the algorithm. It is a measure of the total time required
899899is separate from the three above perspectives, as it does not relate to the results
900900produced by the algorithm, but rather to the time required to produce these
901901results and very much depends on the computer hardware that is used for
902running the algorithm. Depending on the context of a problem (e.g., time constraints), an
903efficient algorithm may be preferable to a reliable one. Regarding the method
904for measuring the runnning time of a computer program, the best practice is to
905repeatedly measure the time required for running the program, and to consider
902running the algorithm. Depending on the context of a problem (e.g., time
903constraints, hardware with limited processing power\dots), an efficient
904algorithm may be preferable to a more reliable or robust one in order to be
905able to comply with the constraints. Regarding the method for measuring the
906running time of a computer program, the best practice is to repeatedly
907measure the time required for running the program, and to then consider only
906908the smallest of these time values; this is due to the fact that the
907imprecision due to other programs running at the same time may only add to the
908measured time, never make it shorter.
909imprecision due to other programs running at the same time on the computer
910used for the measurement may only add to the measured time, never make it
911shorter.
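In Python, for instance, this practice corresponds to taking the minimum over repeated timing measurements, which is what timeit.repeat supports directly (the statement being timed below is an arbitrary placeholder):
\begin{verbatim}
import timeit

# Repeat the measurement several times and keep only the smallest value,
# since other processes can only inflate the measured time.
timings = timeit.repeat(stmt="sum(range(10_000))", repeat=5, number=100)
print(min(timings))
\end{verbatim}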
909912
910913In the light of the above perspectives, one may then wonder what
911914interpretation could be given of the sample mean. Its sensitivity to extreme
931931
932932Given a reliability value $\Theta$, one can estimate the number of times
933933$(A,f)$, having a reliability of $\Theta$, needs to be repeated to
934obtain, with a given probability THR, a result $A(f)$ at least as good as
934obtain, with a given probability THR, a result $(A,f)_i$ at least as good as
935935$\theta$.
936936
937937By application of the definition of $\Theta$, the empirical
938938probability that the algorithm produces at least once a result lower or
939939equal to $\theta$ when repeating running it $t$ times is
940\begin{equation}Pr[A(f) \leq \theta] = 1-(1-\Theta)^t\end{equation} We can then
940\begin{equation}Pr[(A,f)_i \leq \theta] = 1-(1-\Theta)^t\end{equation} We can then
941941derive the number of times $t$ that the process $(A,f)$ needs to be repeated
942such that $Pr[A(f) \leq \theta] \geq \mathrm{THR}$: \begin{equation}t \geq \frac{\log(1 -
942such that $Pr[(A,f)_i \leq \theta] \geq \mathrm{THR}$: \begin{equation}t \geq \frac{\log(1 -
943943\mathrm{THR})}{\log(1 - \Theta)}\label{eq:t}\end{equation} By
944944convention, we set $t = \infty$ in the case $\Theta = 0$ and $t = 1$ in the
945945case $\Theta = 1$.
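The computation is a one-liner; the helper below uses $\mathrm{THR}=0.95$, the confidence level used in the reliability tables later in this article, and follows the conventions above for $\Theta = 0$ and $\Theta = 1$:
\begin{verbatim}
import math

def repeats_needed(reliability, thr=0.95):
    if reliability <= 0.0:
        return math.inf      # convention: t = infinity when Theta = 0
    if reliability >= 1.0:
        return 1.0           # convention: t = 1 when Theta = 1
    return math.log(1.0 - thr) / math.log(1.0 - reliability)

print(repeats_needed(0.236))   # about 11.1, cf. DE on f4 in the CEC 2008 tables
\end{verbatim}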
964964its meaning regarding the performance of an algorithm on a given function.
965965Several alternatives to the sample mean have been proposed, each of them
966966allowing to measure the algorithm's performance in different perspectives.
967Since the context in which the algorihtm is employed defines the perspective
967Since the context in which the algorithm is employed defines the perspective
968968according to which the performance needs to be measured, we suggest to make
969969several comparisons following the methods proposed in \cite{Derrac2011}, each
970of them based on a different perspective. It therfore gives the possibility to
970of them based on a different perspective. It therefore gives the possibility to
971971decide which algorithm is the most accurate, which one is the most reliable
972and which one is the most robust.
972and which one is the most robust. For illustration purposes, two real cases
973are presented below.
973974
974
975\begin{table}
976\caption{Average, Median, Min and Max Fitness}
975\begin{sidewaystable} \caption{Mean, median, minimum and maximum fitness on the CEC 2008 benchmark in 10 dimensions}%{{{3
976\label{tab:cec2008-stats}
977977\begin{tabular}{l|c|c|cc|c|c|cc|c|c|cc}
978978\hline\hline
979 & \multicolumn{4}{c|}{DE} & \multicolumn{4}{c|}{CMAES} & \multicolumn{4}{c}{SA}\\
979 & \multicolumn{4}{c|}{DE} & \multicolumn{4}{c|}{CMA-ES} & \multicolumn{4}{c}{SA}\\
980980\hline
981 & Average & Median & Minimum & Maximum & Average & Median & Minimum & Maximum & Average & Median & Minimum & Maximum\\
981 & Mean & Median & Minimum & Maximum & Mean & Median & Minimum & Maximum & Mean & Median & Minimum & Maximum\\
982982\hline
983F1 & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 2.57e+04$ & $ 2.38e+04$ & $ 1.82e+04$ & $ 3.39e+04$\\
984F2 & $ 1.34e-03$ & $ 1.17e-03$ & $ 2.80e-04$ & $ 4.48e-03$ & $ 5.87e-14$ & $ 5.68e-14$ & $ 5.68e-14$ & $ 1.14e-13$ & $ 6.87e+01$ & $ 7.01e+01$ & $ 5.26e+01$ & $ 7.61e+01$\\
985F3 & $ 2.22e+00$ & $ 2.08e+00$ & $ 1.05e+00$ & $ 3.64e+00$ & $ 2.66e-01$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 3.99e+00$ & $ 3.73e+09$ & $ 3.54e+09$ & $ 1.83e+09$ & $ 5.75e+09$\\
986F4 & $ 7.57e+01$ & $ 7.83e+01$ & $ 4.78e+01$ & $ 9.70e+01$ & $ 2.83e+02$ & $ 2.58e+02$ & $ 2.69e+01$ & $ 5.52e+02$ & $ 2.25e+02$ & $ 2.28e+02$ & $ 1.76e+02$ & $ 2.49e+02$\\
987F5 & $ 5.94e-03$ & $ 1.69e-12$ & $ 0.00e+00$ & $ 3.20e-02$ & $ 1.97e-03$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 1.23e-02$ & $ 2.27e+02$ & $ 2.30e+02$ & $ 1.77e+02$ & $ 2.92e+02$\\
988F6 & $ 3.07e-13$ & $ 2.84e-13$ & $ 8.53e-14$ & $ 7.11e-13$ & $ 1.99e+01$ & $ 1.99e+01$ & $ 1.94e+01$ & $ 2.00e+01$ & $ 1.95e+01$ & $ 1.95e+01$ & $ 1.82e+01$ & $ 2.00e+01$\\
983$f1$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 2.57e+04$ & $ 2.38e+04$ & $ 1.82e+04$ & $ 3.39e+04$\\
984$f2$ & $ 1.34e-03$ & $ 1.17e-03$ & $ 2.80e-04$ & $ 4.48e-03$ & $ 5.87e-14$ & $ 5.68e-14$ & $ 5.68e-14$ & $ 1.14e-13$ & $ 6.87e+01$ & $ 7.01e+01$ & $ 5.26e+01$ & $ 7.61e+01$\\
985$f3$ & $ 2.22e+00$ & $ 2.08e+00$ & $ 1.05e+00$ & $ 3.64e+00$ & $ 2.66e-01$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 3.99e+00$ & $ 3.73e+09$ & $ 3.54e+09$ & $ 1.83e+09$ & $ 5.75e+09$\\
986$f4$ & $ 7.57e+01$ & $ 7.83e+01$ & $ 4.78e+01$ & $ 9.70e+01$ & $ 2.83e+02$ & $ 2.58e+02$ & $ 2.69e+01$ & $ 5.52e+02$ & $ 2.25e+02$ & $ 2.28e+02$ & $ 1.76e+02$ & $ 2.49e+02$\\
987$f5$ & $ 5.94e-03$ & $ 1.69e-12$ & $ 0.00e+00$ & $ 3.20e-02$ & $ 1.97e-03$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 1.23e-02$ & $ 2.27e+02$ & $ 2.30e+02$ & $ 1.77e+02$ & $ 2.92e+02$\\
988$f6$ & $ 3.07e-13$ & $ 2.84e-13$ & $ 8.53e-14$ & $ 7.11e-13$ & $ 1.99e+01$ & $ 1.99e+01$ & $ 1.94e+01$ & $ 2.00e+01$ & $ 1.95e+01$ & $ 1.95e+01$ & $ 1.82e+01$ & $ 2.00e+01$\\
989989\hline\hline
990990\end{tabular}
991\end{table}
991\end{sidewaystable}%}}}3
992992
993Table~\ref{tab:cec2008-stats} presents the mean,
994median, minimum and maximum fitness values obtained by three algorithms on the
995first six functions of the CEC 2008 benchmark \cite{bib:cec2008}, in 10
996dimensions: the Differential Evolution (DE) \cite{bib:DEbook}, the Evolution
997Strategy with Covariance Matrix Adaptation (CMA-ES) \cite{bib:Hansen2003} and
998Simulated Annealing (SA) \cite{Kirkpatrick1983}. Measures of the accuracy and
999robustness of the three algorithms can be found in that table as minimum and
1000maximum values of the sample produced by each algorithm for each of the six
1001functions, as defined in Section~\ref{sec:perspectives}.
9931002
994\begin{table}
995\caption{Probability of obtaining an excellent Fitness}
1003\begin{table} \caption{Reliability on the CEC 2008 benchmark}%{{{3
1004\label{tab:cec2008-reliability}
1005\subtable[Reliability as an empirical probability to produce a result lower than \textit{threshold}]{\label{tab:cec2008-reliability-prob}
9961006\begin{tabular}{l|c|c|c|c}
9971007\hline\hline
998 & \emph{threshold} & DE & CMAES & SA\\
1008 & \emph{threshold} & DE & CMA-ES & SA\\
9991009\hline
1000F1 & 0.00e+00 & \textbf{1.00e+00} & \textbf{1.00e+00} & 0.00e+00\\
1001F2 & 5.68e-14 & 0.00e+00 & \textbf{9.66e-01} & 0.00e+00\\
1002F3 & 0.00e+00 & 0.00e+00 & \textbf{9.33e-01} & 0.00e+00\\
1003F4 & 6.92e+01 & \textbf{2.36e-01} & 6.60e-02 & 0.00e+00\\
1004F5 & 0.00e+00 & 3.96e-01 & \textbf{7.69e-01} & 0.00e+00\\
1005F6 & 1.71e-13 & \textbf{3.66e-01} & 0.00e+00 & 0.00e+00\\
1010$f1$ & 0.00e+00 & \textbf{1.00e+00} & \textbf{1.00e+00} & 0.00e+00\\
1011$f2$ & 5.68e-14 & 0.00e+00 & \textbf{9.66e-01} & 0.00e+00\\
1012$f3$ & 0.00e+00 & 0.00e+00 & \textbf{9.33e-01} & 0.00e+00\\
1013$f4$ & 6.92e+01 & \textbf{2.36e-01} & 6.60e-02 & 0.00e+00\\
1014$f5$ & 0.00e+00 & 3.96e-01 & \textbf{7.69e-01} & 0.00e+00\\
1015$f6$ & 1.71e-13 & \textbf{3.66e-01} & 0.00e+00 & 0.00e+00\\
10061016\hline\hline
10071017\end{tabular}
1008\end{table}
1018}
10091019
1010
1011\begin{table}
1012\caption{Number of repeats needed to obtain an excellent Fitness with a 0.95 probability}
1020\subtable[Number of repeats needed to obtain a result lower than \textit{threshold} with a .95 confidence]{\label{tab:cec2008-reliability-nrp}
10131021\begin{tabular}{l|c|c|c|c}
10141022\hline\hline
1015 & \emph{threshold} & DE & CMAES & SA\\
1023 & \emph{threshold} & DE & CMA-ES & SA\\
10161024\hline
1017F1 & 0.00e+00 & \textbf{ 1.0} & \textbf{ 1.0} & $\infty$\\
1018F2 & 5.68e-14 & $\infty$ & \textbf{ 0.9} & $\infty$\\
1019F3 & 0.00e+00 & $\infty$ & \textbf{ 1.1} & $\infty$\\
1020F4 & 6.92e+01 & \textbf{ 11.1} & 43.9 & $\infty$\\
1021F5 & 0.00e+00 & 5.9 & \textbf{ 2.0} & $\infty$\\
1022F6 & 1.71e-13 & \textbf{ 6.6} & $\infty$ & $\infty$\\
1025$f1$ & 0.00e+00 & \textbf{ 1.0} & \textbf{ 1.0} & $\infty$\\
1026$f2$ & 5.68e-14 & $\infty$ & \textbf{ 0.9} & $\infty$\\
1027$f3$ & 0.00e+00 & $\infty$ & \textbf{ 1.1} & $\infty$\\
1028$f4$ & 6.92e+01 & \textbf{ 11.1} & 43.9 & $\infty$\\
1029$f5$ & 0.00e+00 & 5.9 & \textbf{ 2.0} & $\infty$\\
1030$f6$ & 1.71e-13 & \textbf{ 6.6} & $\infty$ & $\infty$\\
10231031\hline\hline
10241032\end{tabular}
1025\end{table}
1033}
1034\end{table}%}}}3
10261035
1036Table~\ref{tab:cec2008-reliability} presents a measure of the
1037reliability of each algorithm on each function, both as an empirical probability to produce a
1038result below the given threshold (see Table~\ref{tab:cec2008-reliability-prob})
1039and as the number of times the algorithm needs to be repeatedly run to
1040obtain at least once, with a confidence of 0.95, a result below the given
1041threshold (see Table~\ref{tab:cec2008-reliability-nrp}). In these tables, the
1042algorithm with the best reliability (i.e., highest probability or lowest
1043number of repeats) is highlighted in bold font. The figures in those tables
1044clearly indicate that SA is not as reliable as the other two algorithms,
1045since it has a probability of 0 of reaching the given threshold, which translates
1046into an infinite number of repeats (note that the infinity symbol is only a
1047convention, and that with a very large number of repeats, SA would be able to
1048find the function's optimum). We can also see from these tables that DE is
1049unable to reach the threshold in two cases (functions $f2$ and $f3$) while
1050CMA-ES is unable to do so in one case (function $f6$).
10271051
1028\begin{table}
1029\caption{Holm test on the Fitness (reference = DE, estimator = avg)}
1052\begin{table}\caption{Holm test on the Friedman statistic for the CEC 2008 benchmark}\label{tab:cec2008-holm}%{{{3
1053
1054\subtable[Reference algorithm is DE, ranking by mean]{\label{tab:cec2008-holm-avg}%{{{4
10301055\begin{tabular}{c|c|c|c|c|c}
10311056\hline\hline
10321057$i$ & Optimizer & $z$ & $p$ & $\alpha/i$ & Hypothesis\\
10331058\hline
10341059 2 & SA & -1.88e+00 & 3.03e-02 & 2.50e-02 & Accepted\\
1035 1 & CMAES & -2.89e-01 & 3.86e-01 & 5.00e-02 & Accepted\\
1060 1 & CMA-ES & -2.89e-01 & 3.86e-01 & 5.00e-02 & Accepted\\
10361061\hline\hline
1037\end{tabular}
1038\end{table}
1062\end{tabular}}%}}}4
10391063
1040
1041\begin{table}
1042\caption{Holm test on the Fitness (reference = DE, estimator = med)}
1064\subtable[Reference algorithm is DE, ranking by median]{\label{tab:cec2008-holm-med}%{{{4
10431065\begin{tabular}{c|c|c|c|c|c}
10441066\hline\hline
10451067$i$ & Optimizer & $z$ & $p$ & $\alpha/i$ & Hypothesis\\
10461068\hline
10471069 2 & SA & -1.88e+00 & 3.03e-02 & 2.50e-02 & Accepted\\
1048 1 & CMAES & -2.89e-01 & 3.86e-01 & 5.00e-02 & Accepted\\
1070 1 & CMA-ES & -2.89e-01 & 3.86e-01 & 5.00e-02 & Accepted\\
10491071\hline\hline
1050\end{tabular}
1051\end{table}
1072\end{tabular}}%}}}4
10521073
1053
1054\begin{table}
1055\caption{Holm test on the Fitness (reference = CMAES, estimator = min)}
1074\subtable[Reference algorithm is CMA-ES, ranking by accuracy]{\label{tab:cec2008-holm-acc}%{{{4
10561075\begin{tabular}{c|c|c|c|c|c}
10571076\hline\hline
10581077$i$ & Optimizer & $z$ & $p$ & $\alpha/i$ & Hypothesis\\
10791079 2 & SA & -2.31e+00 & 1.05e-02 & 2.50e-02 & Rejected\\
10801080 1 & DE & -2.89e-01 & 3.86e-01 & 5.00e-02 & Accepted\\
10811081\hline\hline
1082\end{tabular}
1083\end{table}
1082\end{tabular}}%}}}4
10841083
1085
1086\begin{table}
1087\caption{Holm test on the Fitness (reference = CMAES, estimator = nrp)}
1084\subtable[Reference algorithm is CMA-ES, ranking by reliability]{\label{tab:cec2008-holm-rel}%{{{4
10881085\begin{tabular}{c|c|c|c|c|c}
10891086\hline\hline
10901087$i$ & Optimizer & $z$ & $p$ & $\alpha/i$ & Hypothesis\\
10891089 2 & SA & -2.17e+00 & 1.52e-02 & 2.50e-02 & Rejected\\
10901090 1 & DE & -4.33e-01 & 3.33e-01 & 5.00e-02 & Accepted\\
10911091\hline\hline
1092\end{tabular}
1093\end{table}
1092\end{tabular}}%}}}4
10941093
1095
1096\begin{table}
1097\caption{Holm test on the Fitness (reference = DE, estimator = max)}
1094\subtable[Reference algorithm is DE, ranking by robustness]{\label{tab:cec2008-holm-rob}%{{{4
10981095\begin{tabular}{c|c|c|c|c|c}
10991096\hline\hline
11001097$i$ & Optimizer & $z$ & $p$ & $\alpha/i$ & Hypothesis\\
11011098\hline
11021099 2 & SA & -2.45e+00 & 7.07e-03 & 2.50e-02 & Rejected\\
1103 1 & CMAES & -5.77e-01 & 2.82e-01 & 5.00e-02 & Accepted\\
1100 1 & CMA-ES & -5.77e-01 & 2.82e-01 & 5.00e-02 & Accepted\\
11041101\hline\hline
1102\end{tabular}}%}}}4
1103\end{table}%}}}3
1104
1105Finally, Table~\ref{tab:cec2008-holm} presents the results of the Holm
1106procedure applied to the Friedman test, as described in \cite{Derrac2011}.
1107Using the sample mean (Table~\ref{tab:cec2008-holm-avg}) or median
1108(Table~\ref{tab:cec2008-holm-med}) as a measure of performance for ranking the
1109algorithms leads the test to consider all three algorithms to have similar
1110performances: the null hypothesis of no difference cannot be rejected when
1111comparing DE to CMA-ES and DE to SA. When the ranking is based on accuracy,
1112reliability or robustness however, the test indicates that DE and CMA-ES have
1113similar performance, but that SA performs significantly worse than CMA-ES or
1114DE. These results correspond more closely to the qualitative estimation of the
1115algorithms' performance that one can make when looking at
1116Tables~\ref{tab:cec2008-stats} and~\ref{tab:cec2008-reliability}: the mean and
1117median values of the results produced by SA are greater than the ones produced
1118by the other two algorithms (with the exception of $f4$ and $f6$
1119where CMA-ES is worse than SA) and the values of reliability clearly indicate
1120SA not to be competitive with DE and CMA-ES.
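For reference, the step-down acceptance/rejection logic applied in these tables can be sketched as follows (a simplified stand-alone version, not the full Friedman-based procedure of \cite{Derrac2011}); with the two $p$-values of Table~\ref{tab:cec2008-holm-avg} it reproduces the two ``Accepted'' decisions:
\begin{verbatim}
def holm(pvalues, alpha=0.05):
    # Sort p-values in increasing order and compare p_(s) with alpha/(k - s);
    # once one hypothesis is accepted, all remaining ones are accepted too.
    order = sorted(range(len(pvalues)), key=lambda i: pvalues[i])
    decisions, still_rejecting = {}, True
    for step, i in enumerate(order):
        threshold = alpha / (len(pvalues) - step)
        still_rejecting = still_rejecting and pvalues[i] <= threshold
        decisions[i] = "Rejected" if still_rejecting else "Accepted"
    return decisions

print(holm([3.03e-02, 3.86e-01]))   # SA and CMA-ES vs. the reference DE
\end{verbatim}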
1121
1122
1123\begin{table}\caption{Mean, median, minimum and maximum fitness on the BBOB 2010 benchmark in 10 dimensions}\label{tab:bbob2010-stats}%{{{3}
1124\begin{tabular}{l|c|c|cc|c|c|cc}%|c|c|cc|c|c|cc}
1125\hline\hline
1126 & \multicolumn{4}{c|}{S3SOME} & \multicolumn{4}{c|}{3SOME}\\% & \multicolumn{4}{c|}{SADE} & \multicolumn{4}{c}{(1+1) CMA-ES}\\
1127\hline
1128 & Mean & Median & Minimum & Maximum & Mean & Median & Minimum & Maximum\\% & Mean & Median & Minimum & Maximum & Mean & Median & Minimum & Maximum\\
1129\hline
1130$ f1$ & $ 2.37e-14$ & $ 2.84e-14$ & $ 1.42e-14$ & $ 5.68e-14$ & $ 2.61e-14$ & $ 2.84e-14$ & $ 1.42e-14$ & $ 5.68e-14$\\% & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$\\
1131$ f2$ & $ 2.08e-14$ & $ 2.84e-14$ & $ 0.00e+00$ & $ 5.68e-14$ & $ 2.84e-14$ & $ 2.84e-14$ & $ 0.00e+00$ & $ 8.53e-14$\\% & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$\\
1132$ f3$ & $ 9.29e-01$ & $ 9.95e-01$ & $ 0.00e+00$ & $ 2.83e+00$ & $ 1.26e+00$ & $ 9.95e-01$ & $ 5.68e-14$ & $ 2.98e+00$\\% & $ 2.16e+00$ & $ 1.99e+00$ & $ 0.00e+00$ & $ 5.97e+00$ & $ 6.94e+01$ & $ 5.77e+01$ & $ 1.99e+01$ & $ 1.46e+02$\\
1133$ f4$ & $ 1.16e+00$ & $ 9.95e-01$ & $ 5.68e-14$ & $ 3.98e+00$ & $ 1.53e+00$ & $ 9.95e-01$ & $ 0.00e+00$ & $ 4.97e+00$\\% & $ 4.28e+00$ & $ 3.98e+00$ & $ 9.95e-01$ & $ 8.95e+00$ & $ 9.90e+01$ & $ 8.01e+01$ & $ 1.59e+01$ & $ 2.37e+02$\\
1134$ f5$ & $ 1.09e-13$ & $ 1.07e-13$ & $ 2.13e-14$ & $ 2.77e-13$ & $ 1.72e+01$ & $ 3.55e-14$ & $ 0.00e+00$ & $ 8.55e+01$\\% & $ 8.57e-14$ & $ 6.39e-14$ & $ 0.00e+00$ & $ 2.77e-13$ & $ 8.11e-01$ & $ 1.97e-01$ & $ 3.33e-04$ & $ 5.93e+00$\\
1135$ f6$ & $ 1.68e-03$ & $ 7.21e-06$ & $ 3.79e-11$ & $ 3.24e-02$ & $ 3.70e-04$ & $ 1.08e-05$ & $ 7.53e-13$ & $ 4.96e-03$\\% & $ 1.89e-14$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 1.28e-13$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$\\
1136$ f7$ & $ 7.33e+00$ & $ 6.81e+00$ & $ 1.11e+00$ & $ 1.61e+01$ & $ 1.11e+01$ & $ 7.81e+00$ & $ 1.86e+00$ & $ 3.54e+01$\\% & $ 2.14e-01$ & $ 6.60e-02$ & $ 0.00e+00$ & $ 7.33e-01$ & $ 8.28e-02$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 3.37e-01$\\
1137$ f8$ & $ 1.03e-01$ & $ 1.12e-03$ & $ 6.45e-06$ & $ 8.63e-01$ & $ 7.15e-02$ & $ 9.76e-04$ & $ 2.90e-04$ & $ 7.72e-01$\\% & $ 7.41e-10$ & $ 8.53e-14$ & $ 0.00e+00$ & $ 1.73e-08$ & $ 1.20e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 3.99e+00$\\
1138$ f9$ & $ 2.66e-01$ & $ 3.27e-04$ & $ 2.03e-04$ & $ 3.99e+00$ & $ 2.91e+00$ & $ 2.44e-04$ & $ 1.55e-04$ & $ 7.13e+01$\\% & $ 6.68e-04$ & $ 2.44e-07$ & $ 2.84e-14$ & $ 7.03e-03$ & $ 7.97e-01$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 3.99e+00$\\
1139$f10$ & $ 3.32e+02$ & $ 3.33e+02$ & $ 1.22e+01$ & $ 9.11e+02$ & $ 3.04e+02$ & $ 3.21e+02$ & $ 4.04e+01$ & $ 5.45e+02$\\% & $ 3.15e+01$ & $ 1.24e+01$ & $ 5.91e-01$ & $ 1.72e+02$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$\\
1140$f11$ & $ 7.50e+01$ & $ 7.31e+01$ & $ 1.05e+01$ & $ 1.55e+02$ & $ 7.87e+01$ & $ 7.21e+01$ & $ 1.47e+01$ & $ 1.64e+02$\\% & $ 1.62e+00$ & $ 9.56e-01$ & $ 1.60e-02$ & $ 6.04e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$\\
1141$f12$ & $ 1.34e+01$ & $ 4.21e+00$ & $ 3.04e-04$ & $ 7.13e+01$ & $ 1.27e+01$ & $ 2.43e+00$ & $ 5.05e-04$ & $ 6.95e+01$\\% & $ 4.92e-01$ & $ 1.42e-01$ & $ 1.76e-03$ & $ 3.11e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$\\
1142$f13$ & $ 1.45e+01$ & $ 1.31e+01$ & $ 7.49e-03$ & $ 3.63e+01$ & $ 1.16e+01$ & $ 6.71e+00$ & $ 1.07e-03$ & $ 3.49e+01$\\% & $ 1.12e-02$ & $ 2.53e-03$ & $ 5.90e-05$ & $ 7.97e-02$ & $ 1.08e+01$ & $ 7.06e+00$ & $ 1.06e-02$ & $ 3.40e+01$\\
1143$f14$ & $ 1.04e-04$ & $ 1.10e-04$ & $ 6.14e-05$ & $ 1.51e-04$ & $ 9.92e-05$ & $ 9.96e-05$ & $ 3.38e-05$ & $ 1.50e-04$\\% & $ 1.12e-05$ & $ 6.94e-06$ & $ 4.17e-07$ & $ 4.57e-05$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$\\
1144$f15$ & $ 5.57e+01$ & $ 5.22e+01$ & $ 1.99e+01$ & $ 1.11e+02$ & $ 1.21e+02$ & $ 1.13e+02$ & $ 1.79e+01$ & $ 2.79e+02$\\% & $ 9.22e+00$ & $ 8.46e+00$ & $ 2.98e+00$ & $ 1.79e+01$ & $ 7.44e+01$ & $ 8.16e+01$ & $ 1.29e+01$ & $ 1.77e+02$\\
1145$f16$ & $ 3.94e+00$ & $ 3.11e+00$ & $ 1.21e+00$ & $ 1.07e+01$ & $ 6.61e+00$ & $ 5.45e+00$ & $ 9.15e-01$ & $ 1.87e+01$\\% & $ 8.39e-01$ & $ 6.06e-01$ & $ 4.72e-02$ & $ 3.01e+00$ & $ 8.30e+00$ & $ 6.63e+00$ & $ 1.43e+00$ & $ 2.06e+01$\\
1146$f17$ & $ 2.04e+00$ & $ 2.17e+00$ & $ 3.87e-01$ & $ 4.62e+00$ & $ 7.71e+00$ & $ 5.06e+00$ & $ 5.44e-01$ & $ 3.88e+01$\\% & $ 1.96e-03$ & $ 4.99e-04$ & $ 7.58e-08$ & $ 2.38e-02$ & $ 5.88e+00$ & $ 5.21e+00$ & $ 4.99e-01$ & $ 1.66e+01$\\
1147$f18$ & $ 1.00e+01$ & $ 8.67e+00$ & $ 7.49e-01$ & $ 5.26e+01$ & $ 2.53e+01$ & $ 1.91e+01$ & $ 1.31e+00$ & $ 8.61e+01$\\% & $ 3.13e-02$ & $ 1.62e-02$ & $ 7.66e-04$ & $ 3.56e-01$ & $ 2.36e+01$ & $ 2.12e+01$ & $ 1.55e+00$ & $ 5.28e+01$\\
1148$f19$ & $ 2.36e+00$ & $ 1.93e+00$ & $ 3.66e-01$ & $ 6.78e+00$ & $ 4.56e+00$ & $ 3.92e+00$ & $ 3.42e-01$ & $ 1.21e+01$\\% & $ 5.66e-01$ & $ 3.44e-01$ & $ 8.55e-02$ & $ 2.67e+00$ & $ 2.87e+00$ & $ 2.38e+00$ & $ 1.51e-01$ & $ 7.24e+00$\\
1149$f20$ & $ 6.12e-01$ & $ 5.63e-01$ & $ 1.18e-01$ & $ 1.28e+00$ & $ 8.50e-01$ & $ 8.28e-01$ & $ 2.37e-01$ & $ 1.52e+00$\\% & $ 8.11e-01$ & $ 7.90e-01$ & $ 2.17e-01$ & $ 1.38e+00$ & $ 1.61e+00$ & $ 1.63e+00$ & $ 9.08e-01$ & $ 2.27e+00$\\
1150$f21$ & $ 4.37e+00$ & $ 2.78e+00$ & $ 0.00e+00$ & $ 1.71e+01$ & $ 6.21e+00$ & $ 2.82e+00$ & $ 6.92e-01$ & $ 4.75e+01$\\% & $ 1.06e+00$ & $ 1.25e+00$ & $ 0.00e+00$ & $ 2.26e+00$ & $ 7.14e+00$ & $ 5.78e+00$ & $ 0.00e+00$ & $ 2.79e+01$\\
1151$f22$ & $ 2.46e+00$ & $ 1.95e+00$ & $ 2.18e-11$ & $ 1.46e+01$ & $ 1.22e+01$ & $ 2.59e+00$ & $ 2.26e-11$ & $ 5.68e+01$\\% & $ 1.95e+00$ & $ 1.95e+00$ & $ 1.95e+00$ & $ 1.95e+00$ & $ 1.36e+01$ & $ 2.59e+00$ & $ 0.00e+00$ & $ 6.22e+01$\\
1152$f23$ & $ 7.36e-01$ & $ 6.91e-01$ & $ 2.20e-01$ & $ 1.30e+00$ & $ 9.75e-01$ & $ 9.18e-01$ & $ 1.88e-01$ & $ 2.07e+00$\\% & $ 4.42e-01$ & $ 1.88e-01$ & $ 1.87e-02$ & $ 1.83e+00$ & $ 1.23e+00$ & $ 1.12e+00$ & $ 2.47e-01$ & $ 3.09e+00$\\
1153$f24$ & $ 5.46e+01$ & $ 5.03e+01$ & $ 2.34e+01$ & $ 9.77e+01$ & $ 9.73e+01$ & $ 9.48e+01$ & $ 2.41e+01$ & $ 1.75e+02$\\% & $ 1.97e+01$ & $ 1.95e+01$ & $ 1.26e+01$ & $ 3.04e+01$ & $ 7.36e+01$ & $ 6.07e+01$ & $ 9.21e+00$ & $ 1.81e+02$\\
1154\hline\hline
11051155\end{tabular}
1106\end{table}
1156\bigbreak
1157\begin{tabular}{l|c|c|cc|c|c|cc}%|c|c|cc|c|c|cc}
1158\hline\hline
1159 & \multicolumn{4}{c|}{SADE} & \multicolumn{4}{c}{(1+1) CMA-ES}\\
1160\hline
1161 & Mean & Median & Minimum & Maximum & Mean & Median & Minimum & Maximum\\
1162\hline
1163$ f1$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$\\
1164$ f2$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$\\
1165$ f3$ & $ 2.16e+00$ & $ 1.99e+00$ & $ 0.00e+00$ & $ 5.97e+00$ & $ 6.94e+01$ & $ 5.77e+01$ & $ 1.99e+01$ & $ 1.46e+02$\\
1166$ f4$ & $ 4.28e+00$ & $ 3.98e+00$ & $ 9.95e-01$ & $ 8.95e+00$ & $ 9.90e+01$ & $ 8.01e+01$ & $ 1.59e+01$ & $ 2.37e+02$\\
1167$ f5$ & $ 8.57e-14$ & $ 6.39e-14$ & $ 0.00e+00$ & $ 2.77e-13$ & $ 8.11e-01$ & $ 1.97e-01$ & $ 3.33e-04$ & $ 5.93e+00$\\
1168$ f6$ & $ 1.89e-14$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 1.28e-13$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$\\
1169$ f7$ & $ 2.14e-01$ & $ 6.60e-02$ & $ 0.00e+00$ & $ 7.33e-01$ & $ 8.28e-02$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 3.37e-01$\\
1170$ f8$ & $ 7.41e-10$ & $ 8.53e-14$ & $ 0.00e+00$ & $ 1.73e-08$ & $ 1.20e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 3.99e+00$\\
1171$ f9$ & $ 6.68e-04$ & $ 2.44e-07$ & $ 2.84e-14$ & $ 7.03e-03$ & $ 7.97e-01$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 3.99e+00$\\
1172$f10$ & $ 3.15e+01$ & $ 1.24e+01$ & $ 5.91e-01$ & $ 1.72e+02$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$\\
1173$f11$ & $ 1.62e+00$ & $ 9.56e-01$ & $ 1.60e-02$ & $ 6.04e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$\\
1174$f12$ & $ 4.92e-01$ & $ 1.42e-01$ & $ 1.76e-03$ & $ 3.11e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$\\
1175$f13$ & $ 1.12e-02$ & $ 2.53e-03$ & $ 5.90e-05$ & $ 7.97e-02$ & $ 1.08e+01$ & $ 7.06e+00$ & $ 1.06e-02$ & $ 3.40e+01$\\
1176$f14$ & $ 1.12e-05$ & $ 6.94e-06$ & $ 4.17e-07$ & $ 4.57e-05$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$ & $ 0.00e+00$\\
1177$f15$ & $ 9.22e+00$ & $ 8.46e+00$ & $ 2.98e+00$ & $ 1.79e+01$ & $ 7.44e+01$ & $ 8.16e+01$ & $ 1.29e+01$ & $ 1.77e+02$\\
1178$f16$ & $ 8.39e-01$ & $ 6.06e-01$ & $ 4.72e-02$ & $ 3.01e+00$ & $ 8.30e+00$ & $ 6.63e+00$ & $ 1.43e+00$ & $ 2.06e+01$\\
1179$f17$ & $ 1.96e-03$ & $ 4.99e-04$ & $ 7.58e-08$ & $ 2.38e-02$ & $ 5.88e+00$ & $ 5.21e+00$ & $ 4.99e-01$ & $ 1.66e+01$\\
1180$f18$ & $ 3.13e-02$ & $ 1.62e-02$ & $ 7.66e-04$ & $ 3.56e-01$ & $ 2.36e+01$ & $ 2.12e+01$ & $ 1.55e+00$ & $ 5.28e+01$\\
1181$f19$ & $ 5.66e-01$ & $ 3.44e-01$ & $ 8.55e-02$ & $ 2.67e+00$ & $ 2.87e+00$ & $ 2.38e+00$ & $ 1.51e-01$ & $ 7.24e+00$\\
1182$f20$ & $ 8.11e-01$ & $ 7.90e-01$ & $ 2.17e-01$ & $ 1.38e+00$ & $ 1.61e+00$ & $ 1.63e+00$ & $ 9.08e-01$ & $ 2.27e+00$\\
1183$f21$ & $ 1.06e+00$ & $ 1.25e+00$ & $ 0.00e+00$ & $ 2.26e+00$ & $ 7.14e+00$ & $ 5.78e+00$ & $ 0.00e+00$ & $ 2.79e+01$\\
1184$f22$ & $ 1.95e+00$ & $ 1.95e+00$ & $ 1.95e+00$ & $ 1.95e+00$ & $ 1.36e+01$ & $ 2.59e+00$ & $ 0.00e+00$ & $ 6.22e+01$\\
1185$f23$ & $ 4.42e-01$ & $ 1.88e-01$ & $ 1.87e-02$ & $ 1.83e+00$ & $ 1.23e+00$ & $ 1.12e+00$ & $ 2.47e-01$ & $ 3.09e+00$\\
1186$f24$ & $ 1.97e+01$ & $ 1.95e+01$ & $ 1.26e+01$ & $ 3.04e+01$ & $ 7.36e+01$ & $ 6.07e+01$ & $ 9.21e+00$ & $ 1.81e+02$\\
1187\hline\hline
1188\end{tabular}
1189\end{table}%}}}3
11071190
1108%\cite{Tirronen2010}
1191The second case concerns the comparison of the Three-Stage Optimal Memetic
1192Exploration (3SOME) \cite{bib:Iacca2012}, the Shrinking 3SOME (S3SOME)
1193\cite{fixme}, the Self-Adaptive Differential Evolution (SADE)\cite{bib:Qin2009} and
1194the (1+1)-Covariance Matrix Adaptation for Evolution Strategies ((1+1) CMA-ES)
1195\cite{bib:Igel2006} on the Blackbox optimization benchmark (BBOB) 2010
1196\cite{bib:BBOB2010} in 10 dimensions. Table~\ref{tab:bbob2010-stats} presents
1197the mean, median, minimum and maximum values produced by the four algorithms
1198above on the said benchmark.
1199
1200\begin{table} \caption{Reliability on the BBOB 2010 benchmark}%{{{3
1201\label{tab:bbob2010-reliability}
1202\subtable[Reliability as an empirical probability to produce a result lower than \textit{threshold}]{\label{tab:bbob2010-reliability-prob}
1203\begin{tabular}{l|c|c|c|c|c}
1204\hline\hline
1205 & \emph{threshold} & S3SOME & 3SOME & SADE & (1+1) CMA-ES\\
1206\hline
1207$ f1$ & 0.00e+00 & 0.00e+00 & 0.00e+00 & \textbf{1.00e+00} & \textbf{1.00e+00}\\
1208$ f2$ & 0.00e+00 & 3.30e-01 & 1.35e-01 & \textbf{1.00e+00} & \textbf{1.00e+00}\\
1209$ f3$ & 5.68e-14 & \textbf{3.35e-01} & 1.01e-01 & 2.34e-01 & 0.00e+00\\
1210$ f4$ & 8.53e-14 & \textbf{2.01e-01} & 2.01e-01 & 0.00e+00 & 0.00e+00\\
1211$ f5$ & 0.00e+00 & 0.00e+00 & \textbf{3.28e-01} & 2.36e-01 & 0.00e+00\\
1212$ f6$ & 0.00e+00 & 0.00e+00 & 0.00e+00 & 5.68e-01 & \textbf{1.00e+00}\\
1213$ f7$ & 0.00e+00 & 0.00e+00 & 0.00e+00 & 1.34e-01 & \textbf{7.35e-01}\\
1214$ f8$ & 0.00e+00 & 0.00e+00 & 0.00e+00 & 1.30e-01 & \textbf{7.03e-01}\\
1215$ f9$ & 0.00e+00 & 0.00e+00 & 0.00e+00 & 0.00e+00 & \textbf{8.03e-01}\\
1216$f10$ & 0.00e+00 & 0.00e+00 & 0.00e+00 & 0.00e+00 & \textbf{1.00e+00}\\
1217$f11$ & 0.00e+00 & 0.00e+00 & 0.00e+00 & 0.00e+00 & \textbf{1.00e+00}\\
1218$f12$ & 0.00e+00 & 0.00e+00 & 0.00e+00 & 0.00e+00 & \textbf{1.00e+00}\\
1219$f13$ & 1.66e-03 & 0.00e+00 & 3.31e-02 & \textbf{3.71e-01} & 0.00e+00\\
1220$f14$ & 0.00e+00 & 0.00e+00 & 0.00e+00 & 0.00e+00 & \textbf{1.00e+00}\\
1221$f15$ & 7.96e+00 & 0.00e+00 & 0.00e+00 & \textbf{4.03e-01} & 0.00e+00\\
1222$f16$ & 3.79e-01 & 0.00e+00 & 0.00e+00 & \textbf{3.91e-01} & 0.00e+00\\
1223$f17$ & 3.15e-04 & 0.00e+00 & 0.00e+00 & \textbf{4.02e-01} & 0.00e+00\\
1224$f18$ & 1.23e-02 & 0.00e+00 & 0.00e+00 & \textbf{4.03e-01} & 0.00e+00\\
1225$f19$ & 3.01e-01 & 0.00e+00 & 0.00e+00 & \textbf{3.62e-01} & 3.31e-02\\
1226$f20$ & 4.54e-01 & \textbf{2.00e-01} & 3.48e-02 & 1.71e-01 & 0.00e+00\\
1227$f21$ & 2.49e-14 & 1.00e-01 & 0.00e+00 & \textbf{2.69e-01} & 3.30e-02\\
1228$f22$ & 8.84e-11 & \textbf{2.05e-01} & 1.33e-01 & 0.00e+00 & 6.66e-02\\
1229$f23$ & 1.86e-01 & 0.00e+00 & 0.00e+00 & \textbf{4.00e-01} & 0.00e+00\\
1230$f24$ & 1.70e+01 & 0.00e+00 & 0.00e+00 & \textbf{3.67e-01} & 3.49e-02\\
1231\hline\hline
1232\end{tabular}}
1233
1234\subtable[Number of repeats needed to obtain a result lower than \textit{threshold} with a .95 confidence]{\label{tab:bbob2010-reliability-nrp}
1235\begin{tabular}{l|c|c|c|c|c}
1236\hline\hline
1237 & \emph{threshold} & S3SOME & 3SOME & SADE & (1+1) CMA-ES\\
1238\hline
1239$ f1$ & 0.00e+00 & $\infty$ & $\infty$ & \textbf{ 1.0} & \textbf{ 1.0}\\
1240$ f2$ & 0.00e+00 & 7.5 & 21.2 & \textbf{ 1.0} & \textbf{ 1.0}\\
1241$ f3$ & 5.68e-14 & \textbf{ 7.4} & 28.7 & 11.4 & $\infty$\\
1242$ f4$ & 8.53e-14 & \textbf{ 13.1} & 13.5 & $\infty$ & $\infty$\\
1243$ f5$ & 0.00e+00 & $\infty$ & \textbf{ 7.4} & 11.3 & $\infty$\\
1244$ f6$ & 0.00e+00 & $\infty$ & $\infty$ & 3.6 & \textbf{ 1.0}\\
1245$ f7$ & 0.00e+00 & $\infty$ & $\infty$ & 21.7 & \textbf{ 2.3}\\
1246$ f8$ & 0.00e+00 & $\infty$ & $\infty$ & 20.8 & \textbf{ 2.5}\\
1247$ f9$ & 0.00e+00 & $\infty$ & $\infty$ & $\infty$ & \textbf{ 1.9}\\
1248$f10$ & 0.00e+00 & $\infty$ & $\infty$ & $\infty$ & \textbf{ 1.0}\\
1249$f11$ & 0.00e+00 & $\infty$ & $\infty$ & $\infty$ & \textbf{ 1.0}\\
1250$f12$ & 0.00e+00 & $\infty$ & $\infty$ & $\infty$ & \textbf{ 1.0}\\
1251$f13$ & 1.66e-03 & $\infty$ & 87.7 & \textbf{ 6.6} & $\infty$\\
1252$f14$ & 0.00e+00 & $\infty$ & $\infty$ & $\infty$ & \textbf{ 1.0}\\
1253$f15$ & 7.96e+00 & $\infty$ & $\infty$ & \textbf{ 5.9} & $\infty$\\
1254$f16$ & 3.79e-01 & $\infty$ & $\infty$ & \textbf{ 5.9} & $\infty$\\
1255$f17$ & 3.15e-04 & $\infty$ & $\infty$ & \textbf{ 5.8} & $\infty$\\
1256$f18$ & 1.23e-02 & $\infty$ & $\infty$ & \textbf{ 5.8} & $\infty$\\
1257$f19$ & 3.01e-01 & $\infty$ & $\infty$ & \textbf{ 6.6} & 90.6\\
1258$f20$ & 4.54e-01 & \textbf{ 13.4} & 86.6 & 16.6 & $\infty$\\
1259$f21$ & 2.49e-14 & 28.5 & $\infty$ & \textbf{ 9.7} & 93.9\\
1260$f22$ & 8.84e-11 & \textbf{ 13.8} & 21.1 & $\infty$ & 42.9\\
1261$f23$ & 1.86e-01 & $\infty$ & $\infty$ & \textbf{ 5.9} & $\infty$\\
1262$f24$ & 1.70e+01 & $\infty$ & $\infty$ & \textbf{ 6.6} & 91.1\\
1263\hline\hline
1264\end{tabular}}
1265\end{table}%}}}3
1266
1267The reliability of the four algorithms is presented in
1268Table~\ref{tab:bbob2010-reliability}, both as an empirical probability of
1269reaching the given threshold and as the number of times the algorithm needs to
1270be run to produce at least once, with a .95 confidence, a result as good as
1271the threshold. As in the previous example, the algorithms that are considered
1272the most reliable, for each function, are highlighted in bold font. From these
1273tables, one can see that SADE and (1+1) CMA-ES seem more reliable than 3SOME
1274and S3SOME. There are many cases however where either SADE or (1+1) CMA-ES is
1275much more reliable than the other algorithm, the latter being unable to reach
1276the threshold.
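
As a reading aid, the following sketch illustrates how the two quantities
reported in Table~\ref{tab:bbob2010-reliability} can be obtained from a
sample of final errors. It assumes the relation
$n = \ln(1-0.95)/\ln(1-\hat{p})$ between the empirical probability
$\hat{p}$ of reaching the threshold and the reported number of repeats $n$,
which is consistent with the values of
Table~\ref{tab:bbob2010-reliability-nrp} (the formal definitions are given
in the earlier sections); the function and variable names are purely
illustrative.
\begin{verbatim}
import math

def reliability(errors, threshold, confidence=0.95):
    # Empirical probability of producing a result at least as good as the
    # threshold, and the (real-valued) number of independent runs needed
    # to do so at least once with the given confidence.
    p = sum(1 for e in errors if e <= threshold) / len(errors)
    if p == 0.0:
        return p, float("inf")  # the threshold is never reached
    if p == 1.0:
        return p, 1.0           # a single run is always sufficient
    return p, max(1.0, math.log(1.0 - confidence) / math.log(1.0 - p))
\end{verbatim}
With $\hat{p} = 2.00\times10^{-1}$ (S3SOME on $f20$), for instance, this
gives $n = \ln(0.05)/\ln(0.8) \approx 13.4$, which matches
Table~\ref{tab:bbob2010-reliability-nrp}.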

\begin{table}\caption{Holm test on the Friedman statistic for the BBOB 2010 benchmark}\label{tab:bbob2010-holm}%{{{3
\subtable[Reference algorithm is SADE, ranking by mean]{\label{tab:bbob2010-holm-avg}%{{{4
\begin{tabular}{c|c|c|c|c|c}
\hline\hline
$i$ & Optimizer & $z$ & $p$ & $\alpha/i$ & Hypothesis\\
\hline
 3 & 3SOME & -4.92e+00 & 4.34e-07 & 1.67e-02 & Rejected\\
 2 & (1+1) CMA-ES & -3.13e+00 & 8.73e-04 & 2.50e-02 & Rejected\\
 1 & S3SOME & -2.68e+00 & 3.65e-03 & 5.00e-02 & Rejected\\
\hline\hline
\end{tabular}}%}}}4

\subtable[Reference algorithm is SADE, ranking by median]{\label{tab:bbob2010-holm-med}%{{{4
\begin{tabular}{c|c|c|c|c|c}
\hline\hline
$i$ & Optimizer & $z$ & $p$ & $\alpha/i$ & Hypothesis\\
\hline
 3 & 3SOME & -3.97e+00 & 3.61e-05 & 1.67e-02 & Rejected\\
 2 & S3SOME & -2.96e+00 & 1.52e-03 & 2.50e-02 & Rejected\\
 1 & (1+1) CMA-ES & -2.68e+00 & 3.65e-03 & 5.00e-02 & Rejected\\
\hline\hline
\end{tabular}}%}}}4

\subtable[Reference algorithm is SADE, ranking by accuracy]{\label{tab:bbob2010-holm-acc}%{{{4
\begin{tabular}{c|c|c|c|c|c}
\hline\hline
$i$ & Optimizer & $z$ & $p$ & $\alpha/i$ & Hypothesis\\
\hline
 3 & 3SOME & -3.24e+00 & 5.93e-04 & 1.67e-02 & Rejected\\
 2 & S3SOME & -2.68e+00 & 3.65e-03 & 2.50e-02 & Rejected\\
 1 & (1+1) CMA-ES & -1.45e+00 & 7.31e-02 & 5.00e-02 & Accepted\\
\hline\hline
\end{tabular}}%}}}4

\subtable[Reference algorithm is SADE, ranking by reliability]{\label{tab:bbob2010-holm-rel}%{{{4
\begin{tabular}{c|c|c|c|c|c}
\hline\hline
$i$ & Optimizer & $z$ & $p$ & $\alpha/i$ & Hypothesis\\
\hline
 3 & 3SOME & -2.85e+00 & 2.18e-03 & 1.67e-02 & Rejected\\
 2 & S3SOME & -2.29e+00 & 1.10e-02 & 2.50e-02 & Rejected\\
 1 & (1+1) CMA-ES & -8.94e-01 & 1.86e-01 & 5.00e-02 & Accepted\\
\hline\hline
\end{tabular}}%}}}4

\subtable[Reference algorithm is SADE, ranking by robustness]{\label{tab:bbob2010-holm-rob}%{{{4
\begin{tabular}{c|c|c|c|c|c}
\hline\hline
$i$ & Optimizer & $z$ & $p$ & $\alpha/i$ & Hypothesis\\
\hline
 3 & 3SOME & -4.70e+00 & 1.33e-06 & 1.67e-02 & Rejected\\
 2 & (1+1) CMA-ES & -2.85e+00 & 2.18e-03 & 2.50e-02 & Rejected\\
 1 & S3SOME & -2.52e+00 & 5.94e-03 & 5.00e-02 & Rejected\\
\hline\hline
\end{tabular}}%}}}4

\end{table}%}}}3

Finally, Table~\ref{tab:bbob2010-holm} presents the results of the Holm
procedure applied to the Friedman test, as described in \cite{Derrac2011}.
Ranking the algorithms based on their average or median performances (see
Tables~\ref{tab:bbob2010-holm-avg} and~\ref{tab:bbob2010-holm-med}
respectively) indicates that SADE is the best performing algorithm over all
24 functions of the benchmark, its performance being significantly different
from that of the three other algorithms, in particular from (1+1) CMA-ES.
When the algorithms are examined from the perspectives of accuracy,
reliability and robustness, however, the results differ. The tests show that
the accuracy (Table~\ref{tab:bbob2010-holm-acc}) and reliability
(Table~\ref{tab:bbob2010-holm-rel}) of SADE and (1+1) CMA-ES are not
significantly different. SADE is, however, more robust than any other
algorithm on this benchmark (see Table~\ref{tab:bbob2010-holm-rob}).
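
For reference, the Holm step-down procedure used above compares the
reference algorithm to each of the other $k-1$ algorithms through the
statistic $z = (R_{\mathrm{ref}} - R_i)/\sqrt{k(k+1)/(6N)}$, computed from
the average Friedman ranks $R_i$ over the $N$ benchmark functions
\cite{Derrac2011}; the resulting one-sided $p$-values are then compared to
the Holm-adjusted levels $\alpha/i$, as in the tables above. The sketch
below illustrates this computation under these assumptions (here $k=4$,
$N=24$ and SADE is the reference); it is not the exact implementation used
for this article, and the names are purely illustrative.
\begin{verbatim}
import math
from scipy.stats import norm

def holm_vs_reference(avg_ranks, reference, n_problems, alpha=0.05):
    # avg_ranks: dict mapping each algorithm's name to its average
    # Friedman rank over the n_problems benchmark functions.
    k = len(avg_ranks)
    se = math.sqrt(k * (k + 1) / (6.0 * n_problems))
    rows = [(name, (avg_ranks[reference] - rank) / se)
            for name, rank in avg_ranks.items() if name != reference]
    rows = [(name, z, norm.cdf(z)) for name, z in rows]  # one-sided p-value
    rows.sort(key=lambda r: r[2])       # most significant hypothesis first
    results, retained = [], False
    for j, (name, z, p) in enumerate(rows):
        level = alpha / (len(rows) - j)  # Holm-adjusted significance level
        retained = retained or p > level
        results.append((len(rows) - j, name, z, p, level,
                        "Accepted" if retained else "Rejected"))
    return results
\end{verbatim}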

\section{Conclusion}%{{{1

In this article, the relevance of using the sample mean as the only estimate
of algorithmic performance in the fields of computational intelligence and
data mining has been discussed. The reasons behind the adoption of the sample
mean for measuring an algorithm's performance have been conjectured, and
quantitative and qualitative interpretations of the mean, standard deviation,
median and quantiles have been presented. It has been shown that, contrary to
a widespread belief, the results (best fitness, or smallest error) produced
by such algorithms do not follow a normal distribution. A formal
counter-example based on the first-order statistic, using a trivial
optimization algorithm on a trivial test function, has shown that the
distribution of the results is indeed not normal; this fact has additionally
been illustrated by examples from experimental results. It follows that the
statistical analysis of algorithmic results must not be based on the
assumption of normality; this confirms, in particular, the earlier empirical
finding that Student's t-test must not be used for pairwise, single-function
comparisons of algorithms, and it also stresses that the sample mean and the
associated standard deviation cannot be interpreted under the assumption of
such a distribution.
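
As a minimal, self-contained illustration of that order-statistic argument
(not necessarily the exact counter-example used earlier in this article),
consider a pure random search that draws $n$ points independently and
uniformly at random on $[0,1]$ and minimizes $f(x)=x$: the best value it
returns is the first-order statistic $X_{(1)}$, whose cumulative
distribution function is
\[
  P\left(X_{(1)} \leq x\right) = 1 - (1-x)^n, \qquad x \in [0,1],
\]
a distribution that is bounded below by the optimum and right-skewed for
$n>1$, and therefore clearly not normal.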

Alternative measures of an algorithm's performance have been suggested,
making it possible to consider the performance from different perspectives,
depending on the context in which the algorithm is applied. These
perspectives are accuracy, reliability, robustness and efficiency. Accuracy
measures the algorithm's ability to get close to the problem's global
optimum and gives an estimate of its value. Reliability is the empirical
probability that an algorithm produces solutions below a given threshold,
the latter marking the limit between a ``good'' solution and a ``bad'' one;
based on this probability, one can compute the number of times the algorithm
needs to be repeated to produce a ``good'' result with a given confidence.
Robustness measures the ``worst'' result the algorithm may achieve, while
efficiency is an estimate of the computing time needed for running the
algorithm (not counting the time required to perform a fitness evaluation).
When an estimate of the problem's optimum is needed, algorithms may be
compared on their accuracy. In a context where an algorithm can be repeated
multiple times and the best solution can be selected (e.g., the optimization
of an engineering problem), comparing algorithms on their reliability allows
one to select the algorithm that requires the fewest repeats. In a context
where the algorithm cannot be run more than once (e.g., an online
optimization problem), comparing algorithms on their robustness allows one
to select the algorithm producing the ``least bad'' solution. In a context
where execution time matters (such as an embedded system with low processing
power), it may be beneficial to compare algorithms based on their
efficiency.

Statistical tests for performing multiple-problem comparisons of algorithms
have been published in the past, relying on ranking the algorithms on each
problem by their mean result. Two study cases have been presented in this
article in which the ranking has been based on accuracy, reliability and
robustness, and the results of the tests have been compared to those
obtained by ranking the algorithms on the sample mean or the median. In both
cases a difference has been observed, the tests based on the three above
perspectives being closer to the qualitative assessment that can be made by
examining the raw algorithmic results.

%}}}1

\bibliographystyle{ieeetr}
\bibliography{biblio}
\end{document}