%% bare_conf.tex
%% V1.3
%% 2007/01/11
%% by Michael Shell
%% See:
%% http://www.michaelshell.org/
%% for current contact information.
%%
%% This is a skeleton file demonstrating the use of IEEEtran.cls
%% (requires IEEEtran.cls version 1.7 or later) with an IEEE conference paper.
%%
%% Support sites:
%% http://www.michaelshell.org/tex/ieeetran/
%% http://www.ctan.org/tex-archive/macros/latex/contrib/IEEEtran/
%% and
%% http://www.ieee.org/
%%*************************************************************************
%% Legal Notice:
%% This code is offered as-is without any warranty either expressed or
%% implied; without even the implied warranty of MERCHANTABILITY or
%% FITNESS FOR A PARTICULAR PURPOSE!
%% User assumes all risk.
%% In no event shall IEEE or any contributor to this code be liable for
%% any damages or losses, including, but not limited to, incidental,
%% consequential, or any other damages, resulting from the use or misuse
%% of any information contained here.
%%
%% All comments are the opinions of their respective authors and are not
%% necessarily endorsed by the IEEE.
%%
%% This work is distributed under the LaTeX Project Public License (LPPL)
%% ( http://www.latex-project.org/ ) version 1.3, and may be freely used,
%% distributed and modified. A copy of the LPPL, version 1.3, is included
%% in the base LaTeX documentation of all distributions of LaTeX released
%% 2003/12/01 or later.
%% Retain all contribution notices and credits.
%% ** Modified files should be clearly indicated as such, including **
%% ** renaming them and changing author support contact information. **
%%
%% File list of work: IEEEtran.cls, IEEEtran_HOWTO.pdf, bare_adv.tex,
%% bare_conf.tex, bare_jrnl.tex, bare_jrnl_compsoc.tex
%%*************************************************************************
% *** Authors should verify (and, if needed, correct) their LaTeX system ***
% *** with the testflow diagnostic prior to trusting their LaTeX platform ***
% *** with production work. IEEE's font choices can trigger bugs that do ***
% *** not appear when using other class files. ***
% The testflow support page is at:
% http://www.michaelshell.org/tex/testflow/
% Note that the a4paper option is mainly intended so that authors in
% countries using A4 can easily print to A4 and see how their papers will
% look in print - the typesetting of the document will not typically be
% affected with changes in paper size (but the bottom and side margins will).
% Use the testflow package mentioned above to verify correct handling of
% both paper sizes by the user's LaTeX system.
%
% Also note that the "draftcls" or "draftclsnofoot", not "draft", option
% should be used if it is desired that the figures are to be displayed in
% draft mode.
%
\documentclass[conference]{IEEEtran}
% Add the compsoc option for Computer Society conferences.
%
% If IEEEtran.cls has not been installed into the LaTeX system files,
% manually specify the path to it like:
% \documentclass[conference]{../sty/IEEEtran}
% Some very useful LaTeX packages include:
% (uncomment the ones you want to load)
% *** MISC UTILITY PACKAGES ***
%
\usepackage{ifpdf}
% Heiko Oberdiek's ifpdf.sty is very useful if you need conditional
% compilation based on whether the output is pdf or dvi.
% usage:
% \ifpdf
% % pdf code
% \else
% % dvi code
% \fi
% The latest version of ifpdf.sty can be obtained from:
% http://www.ctan.org/tex-archive/macros/latex/contrib/oberdiek/
% Also, note that IEEEtran.cls V1.7 and later provides a builtin
% \ifCLASSINFOpdf conditional that works the same way.
% When switching from latex to pdflatex and vice-versa, the compiler may
% have to be run twice to clear warning/error messages.
\usepackage{graphicx}
\usepackage[space]{grffile}
\usepackage{latexsym}
\usepackage{textcomp}
\usepackage{longtable}
\usepackage{tabulary}
\usepackage{booktabs,array,multirow}
\usepackage{amsfonts,amsmath,amssymb}
\providecommand\citet{\cite}
\providecommand\citep{\cite}
\providecommand\citealt{\cite}
\usepackage{url}
\usepackage{hyperref}
\hypersetup{colorlinks=false,pdfborder={0 0 0}}
\usepackage{etoolbox}
\makeatletter
\patchcmd\@combinedblfloats{\box\@outputbox}{\unvbox\@outputbox}{}{%
\errmessage{\noexpand\@combinedblfloats could not be patched}%
}%
\makeatother
% You can conditionalize code for latexml or normal latex using this.
\newif\iflatexml\latexmlfalse
\AtBeginDocument{\DeclareGraphicsExtensions{.pdf,.PDF,.eps,.EPS,.png,.PNG,.tif,.TIF,.jpg,.JPG,.jpeg,.JPEG}}
\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
% *** CITATION PACKAGES ***
%
\usepackage{cite}
% cite.sty was written by Donald Arseneau
% V1.6 and later of IEEEtran pre-defines the format of the cite.sty package
% \cite{} output to follow that of IEEE. Loading the cite package will
% result in citation numbers being automatically sorted and properly
% "compressed/ranged". e.g., [1], [9], [2], [7], [5], [6] without using
% cite.sty will become [1], [2], [5]--[7], [9] using cite.sty. cite.sty's
% \cite will automatically add leading space, if needed. Use cite.sty's
% noadjust option (cite.sty V3.8 and later) if you want to turn this off.
% cite.sty is already installed on most LaTeX systems. Be sure and use
% version 4.0 (2003-05-27) and later if using hyperref.sty. cite.sty does
% not currently provide for hyperlinked citations.
% The latest version can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/cite/
% The documentation is contained in the cite.sty file itself.
% *** MATH PACKAGES ***
%
%\usepackage[cmex10]{amsmath}
% A popular package from the American Mathematical Society that provides
% many useful and powerful commands for dealing with mathematics. If using
% it, be sure to load this package with the cmex10 option to ensure that
% only type 1 fonts will utilized at all point sizes. Without this option,
% it is possible that some math symbols, particularly those within
% footnotes, will be rendered in bitmap form which will result in a
% document that can not be IEEE Xplore compliant!
%
% Also, note that the amsmath package sets \interdisplaylinepenalty to 10000
% thus preventing page breaks from occurring within multiline equations. Use:
%\interdisplaylinepenalty=2500
% after loading amsmath to restore such page breaks as IEEEtran.cls normally
% does. amsmath.sty is already installed on most LaTeX systems. The latest
% version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/required/amslatex/math/
% *** SPECIALIZED LIST PACKAGES ***
%
%\usepackage{algorithmic}
% algorithmic.sty was written by Peter Williams and Rogerio Brito.
% This package provides an algorithmic environment fo describing algorithms.
% You can use the algorithmic environment in-text or within a figure
% environment to provide for a floating algorithm. Do NOT use the algorithm
% floating environment provided by algorithm.sty (by the same authors) or
% algorithm2e.sty (by Christophe Fiorio) as IEEE does not use dedicated
% algorithm float types and packages that provide these will not provide
% correct IEEE style captions. The latest version and documentation of
% algorithmic.sty can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/algorithms/
% There is also a support site at:
% http://algorithms.berlios.de/index.html
% Also of interest may be the (relatively newer and more customizable)
% algorithmicx.sty package by Szasz Janos:
% http://www.ctan.org/tex-archive/macros/latex/contrib/algorithmicx/
% *** ALIGNMENT PACKAGES ***
%
%\usepackage{array}
% Frank Mittelbach's and David Carlisle's array.sty patches and improves
% the standard LaTeX2e array and tabular environments to provide better
% appearance and additional user controls. As the default LaTeX2e table
% generation code is lacking to the point of almost being broken with
% respect to the quality of the end results, all users are strongly
% advised to use an enhanced (at the very least that provided by array.sty)
% set of table tools. array.sty is already installed on most systems. The
% latest version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/required/tools/
%\usepackage{mdwmath}
%\usepackage{mdwtab}
% Also highly recommended is Mark Wooding's extremely powerful MDW tools,
% especially mdwmath.sty and mdwtab.sty which are used to format equations
% and tables, respectively. The MDWtools set is already installed on most
% LaTeX systems. The lastest version and documentation is available at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/mdwtools/
% IEEEtran contains the IEEEeqnarray family of commands that can be used to
% generate multiline equations as well as matrices, tables, etc., of high
% quality.
%\usepackage{eqparbox}
% Also of notable interest is Scott Pakin's eqparbox package for creating
% (automatically sized) equal width boxes - aka "natural width parboxes".
% Available at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/eqparbox/
% *** SUBFIGURE PACKAGES ***
%\usepackage[tight,footnotesize]{subfigure}
% subfigure.sty was written by Steven Douglas Cochran. This package makes it
% easy to put subfigures in your figures. e.g., "Figure 1a and 1b". For IEEE
% work, it is a good idea to load it with the tight package option to reduce
% the amount of white space around the subfigures. subfigure.sty is already
% installed on most LaTeX systems. The latest version and documentation can
% be obtained at:
% http://www.ctan.org/tex-archive/obsolete/macros/latex/contrib/subfigure/
% subfigure.sty has been superceeded by subfig.sty.
%\usepackage[caption=false]{caption}
%\usepackage[font=footnotesize]{subfig}
% subfig.sty, also written by Steven Douglas Cochran, is the modern
% replacement for subfigure.sty. However, subfig.sty requires and
% automatically loads Axel Sommerfeldt's caption.sty which will override
% IEEEtran.cls handling of captions and this will result in nonIEEE style
% figure/table captions. To prevent this problem, be sure and preload
% caption.sty with its "caption=false" package option. This is will preserve
% IEEEtran.cls handing of captions. Version 1.3 (2005/06/28) and later
% (recommended due to many improvements over 1.2) of subfig.sty supports
% the caption=false option directly:
%\usepackage[caption=false,font=footnotesize]{subfig}
%
% The latest version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/subfig/
% The latest version and documentation of caption.sty can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/caption/
% *** FLOAT PACKAGES ***
%
%\usepackage{fixltx2e}
% fixltx2e, the successor to the earlier fix2col.sty, was written by
% Frank Mittelbach and David Carlisle. This package corrects a few problems
% in the LaTeX2e kernel, the most notable of which is that in current
% LaTeX2e releases, the ordering of single and double column floats is not
% guaranteed to be preserved. Thus, an unpatched LaTeX2e can allow a
% single column figure to be placed prior to an earlier double column
% figure. The latest version and documentation can be found at:
% http://www.ctan.org/tex-archive/macros/latex/base/
%\usepackage{stfloats}
% stfloats.sty was written by Sigitas Tolusis. This package gives LaTeX2e
% the ability to do double column floats at the bottom of the page as well
% as the top. (e.g., "\begin{figure*}[!b]" is not normally possible in
% LaTeX2e). It also provides a command:
%\fnbelowfloat
% to enable the placement of footnotes below bottom floats (the standard
% LaTeX2e kernel puts them above bottom floats). This is an invasive package
% which rewrites many portions of the LaTeX2e float routines. It may not work
% with other packages that modify the LaTeX2e float routines. The latest
% version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/sttools/
% Documentation is contained in the stfloats.sty comments as well as in the
% presfull.pdf file. Do not use the stfloats baselinefloat ability as IEEE
% does not allow \baselineskip to stretch. Authors submitting work to the
% IEEE should note that IEEE rarely uses double column equations and
% that authors should try to avoid such use. Do not be tempted to use the
% cuted.sty or midfloat.sty packages (also by Sigitas Tolusis) as IEEE does
% not format its papers in such ways.
\usepackage{amsmath}
\usepackage{algorithm}
\usepackage[noend]{algpseudocode}
\begin{document}
%
% paper title
% can use linebreaks \\ within to get better formatting as desired
% Do not put math or special symbols in the title.
\title{Parallel implementations of a TV-$L^1$ image-denoising algorithm}
\author{%
\IEEEauthorblockN{%
Christopher Prince}% <-this % stops a space
\IEEEauthorblockA{%
NYU Center for Urban Science \& Progress% <-this % stops a space
}}
% make the title area
\maketitle
% As a general rule, do not put math, special symbols or citations
% in the abstract
\selectlanguage{english}
\begin{abstract}
I describe parallel implementations of a total variational image-denoising algorithm. Specifically, the primal-dual formulation of the TV-$L^1$ model is implemented in both the shared memory and distributed memory paradigms using the Open Multi-Processing (OpenMP) API and the Message Passing Interface (MPI), respectively. Experiments on the Stampede supercomputer show good weak and strong scaling performance. Weak scaling performance suggests that the shared memory implementation may be particularly suited to very large problems where the overhead associated with passing and receiving messages is much less than the local work. A short discussion on applications follows.%
\end{abstract}%
% no keywords
% *** Do not adjust lengths that control margins, column widths, etc. ***
% *** Do not use packages that alter fonts (such as pslatex). ***
% There should be no need to do such things with IEEEtran.cls V1.6 and later.
% (Unless specifically asked to do so by the journal or conference you plan
% to submit to, of course. )
\selectlanguage{english}
\begin{abstract}
I describe parallel implementations of a total variational image-denoising algorithm. Specifically, the primal-dual formulation of the TV-$L^1$ model is implemented in both the shared memory and distributed memory paradigms using the Open Multi-Processing (OpenMP) API and the Message Passing Interface (MPI), respectively. Experiments on the Stampede supercomputer show good weak and strong scaling performance. Weak scaling performance suggests that the distributed implementation may be particularly suited to very large problems where the overhead associated with passing and receiving messages is much less than the local work.
\end{abstract}
\section{Introduction}
Imaging acquisition involves many hardware and software stages that introduce error sources. This is seen as visual artifacts in the image, typically recognized as noise. This can be especially noticeable in images acquired with low levels of illumination, such as night photography.
Two common manifestations of noise in digital images are Gaussian noise and salt-and-pepper noise. Gaussian noise is typically associated with errors in detection. It produces pixel values that vary within a quasi-normally distributed range about the "true" value at that point in the image. Salt-and-pepper noise typically arises from transmission errors, and the pixel value is recorded as either fully on or fully off (in grayscale, white or black). Removing these artifacts can be desirable from an aesthetic perspective or in order to pre-process images for other workflows. Common techniques to address this include Gaussian blurring, which takes the convolution of an image with a Gaussian kernel window, and median filtering, which replaces pixels with the median value of a sliding window. These techniques can reduce noise; however, they are also susceptible to blurring edges of features.
The total variation technique was introduced in 1992 by Rudin, Osher and Fatemi (ROF) \hyperref[csl:1]{[1]} as an alternative denoising method. The method works by iteratively constructing a function $u$ on a domain $\Omega$ that minimizes its energy with an input function $f$:
\begin{equation}
\min_{u}\int_{\Omega}\left\Vert{\nabla{u}}\right\Vert
+ \lambda\int_{\Omega}(u-f)^2
\end{equation}
with $\Vert\cdot\Vert$ being the $L^2$ norm. The first term is the total variation of the image. It is a regularizer for the minimization function. The second term also includes an $L^2$ norm, which means that the problem is convex and has a unique solution \hyperref[csl:2]{[2]}.
Replacing the second term above with an $L^1$ norm leads to the TV-$L^1$ model:
\begin{equation}
\min_{u}\int_{\Omega}\left\Vert{\nabla{u}}\right\Vert
+ \lambda\int_{\Omega}\left\vert{u-f}\right\vert
\end{equation}
The TV-$L^1$ model is not a (necessarily) convex model; however, when applied to discrete images, the model tends to offer better performance at removing salt-and-pepper noise from images than the ROF model and is thus an important image processing technique. It is a standard method implemented in many computer vision packages, including OpenCV \hyperref[csl:3]{[3]}.
\section{Algorithm}
\makeatletter
\def\BState{\State\hskip-\ALG@thistlm}
\makeatother
For discrete images in a TV-$L^1$ model, $\Omega \subset \mathbb{R}^2$ and the integrals become sums over the pixels. The discretization of the TV-$L^1$ problem is well suited for parallelization. At all stages of the calculation, only immediate neighbors are needed. Each stage of the calculation is also completely inside of a loop over each pixel, making the shared memory OpenMP implementation particularly straightforward.
A pseudocode representation of the algorithm is given below. The interpretation of the parameters are not included here; the function descriptions are provided to illustrate the algorithm dependencies. For full details of the parameters, see \hyperref[csl:4]{[4]}.
\begin{algorithm}
\caption{{Primal-dual TV-$L^1$ denoising (after Chambolle and Pock)}}\label{euclid}
\begin{algorithmic}[1]
\Procedure{TVL1}{}
\State $X_0 \gets image$
\State $(px, py) \gets \nabla(X_0)$
\For{$iter := 0, maxiter$}
\ForAll{$(i,j) \in X_{iter}$}:
\State $(dx, dy) \gets \nabla(X_{iter})$
\State $(px, py) \gets \textbf{\textrm{project}}(px + \sigma dx, py + \sigma dy)$
\State $nablat \gets \nabla^T(px, py)$
\State $X^\prime \gets \textbf{\textrm{shrink}}(X_{iter} - \tau nablat, \lambda\tau)$
\State $X_{iter+1} \gets X_{iter} + (X_{iter} - X^\prime)$
\EndFor
\EndFor
\Return $X_{maxiter}$\Comment{The final update.}
\EndProcedure
\end{algorithmic}
\end{algorithm}
The calculation of the gradient for an image $u$ on $i$ by $j$ pixels is given by
\begin{equation}
\nabla{u_{i,j}} = \left(\begin{matrix}dx_{i,j} \\ dy_{i,j}\end{matrix}\right) = \left(\begin{matrix}u_{i+1,j} - u_{i,j} \\ u_{i,j+1} - u_{i,j}\end{matrix}\right)
\end{equation}
The \textbf{project} function takes the partial gradients $dx$ and $dy$ and projects them into a unit circle: If the length of the local gradient is larger than unity, the gradient is clipped by the unit circle; otherwise it is unchanged.
\begin{subequations}
\begin{align}
\textbf{anorm}_{i,j} &= \sqrt{dx^2_{i,j} + dy^2_{i,j}} \\
\textbf{project}\left(\begin{matrix}dx_{i,j} \\ dy_{i,j}\end{matrix}\right) &=
\begin{cases}
\left(\begin{matrix}dx_{i,j} \\ dy_{i,j}\end{matrix}\right) & \textbf{anorm}_{i,j}\leq 1 \\
\frac{1}{\textbf{anorm}_{i,j}} \left(\begin{matrix}dx_{i,j} \\ dy_{i,j}\end{matrix}\right) & \textrm{otherwise}
\end{cases}
\end{align}
\end{subequations}
The reverse transformation from projected gradients to pixel values is constructed directly from the gradient definition above.
\begin{equation}
\nabla^T\left(\begin{matrix}dx_{i,j} \\dy_{i,j}\end{matrix}\right) = dx_{i-1,j}-dx_{i,j} + dy_{i,j-1} -dy_{i,j}
\end{equation}
Finally, the \textbf{shrink} function clips the allowable difference of the transformed image from the $iter$-th solution $X_{iter}$. The solution is then updated from this clipped value.
\section{Implementation and Methodology}
I wrote the implementations in C, following the general scheme above and using the nomenclature (also used in the function descriptions above) from a Python implementation written by Alexander Mordvintsev \hyperref[csl:5]{[5]}. Images are passed into the routine and the results are written out as PPM files using ${\tt ppma\_io.c}$ by John Burkardt \hyperref[csl:6]{[6]}. These reads and writes from disk are not included in the timing of the codes, but writing the results out is useful for validating that the filtering operation performs as expected.
When implemented using MPI, it is necessary to buffer the data in a way that allows for border values to be sent between processors. The general scheme for this buffering is illustrated in Figure \ref{fig:CellLayout}. Only the gradient operation and its inverse require communication steps.
The results of two MPI implementations are presented here. The first uses non-blocking communication to send and receive messages, and it initializes (with zero) the entries of the output arrays. The second also uses non-blocking communications, but does more interleaving of operations by performing all calculations in the array interior before processing the receive messages and calculating the array borders.\selectlanguage{english}
\begin{figure}[h!]
\begin{center}
\includegraphics[width=0.70\columnwidth]{figures/Screenshot-from-2017-05-13-01-11-19/Screenshot-from-2017-05-13-01-25-27}
\caption{{\label{fig:CellLayout} Arrangement of data for subproblems in a distributed memory implementation. To calculate the gradients $dx$ and $dy$, ghosts are needed below and to the right of the subproblem borders. For the reverse transformation, ghosts are needed above and to the left of the subproblem borders.%
}}
\end{center}
\end{figure}
The indexing of the matrices is done in one dimension, and the location in the matrix is calculated based on the data held in the matrix. Gradients and their projections must account for the ghost cells in the first column and first row, and images and their transformations must account for the ghost cells in the last column and last row.
To test the performance of the parallel implementations, it is necessary to measure both weak scaling and strong scaling. For weak scaling, I chose two different work sizes per processor to identify any noticeable difference in scaling with varying computation to communication ratio. The first work size is $64 \times 64$ pixels per processor. The source image is derived from the $512 \times 512$ "ascent" public domain reference image included in the Python {\tt SciPy} package \hyperref[csl:7]{[7]}. Although not strictly necessary to test the performance of the algorithm, both Gaussian and salt-and-pepper noise were added to the reference image so that the accuracy of the implementation could be compared against \hyperref[csl:5]{[5]}. The image is truncated in each dimension by multiples of 64 pixels, and the test is performed on $\{i^2 | i \in \mathbb{Z}^+, i \leq 8\}$ processors for the MPI implementations and $\{i^2 | i \in \mathbb{Z}^+, i \leq 4\}$ for the OpenMP implementation. This latter restriction is necessitated by the limitation on the number of processes per node.\selectlanguage{english}
\begin{figure}[h!]
\begin{center}
\includegraphics[width=0.70\columnwidth]{figures/img-obs/img-obs}
\caption{{\label{fig:NoisySteps} $512 \times 512$ pixel source image used for the "small" ($64 \times 64$ pixel per processor) weak scaling test.%
}}
\end{center}
\end{figure}\selectlanguage{english}
\begin{figure}[h!]
\begin{center}
\includegraphics[width=0.70\columnwidth]{figures/output-cpu/output-cpu}
\caption{{\label{fig:cleansteps} The result after 100 iterations of the the TV-$L^1$ denoising filter applied to Figure \ref{fig:NoisySteps}. This output comes from the OpenMP implementation tested in this report.%
}}
\end{center}
\end{figure}\selectlanguage{english}
\begin{figure}[h!]
\begin{center}
\includegraphics[width=0.70\columnwidth]{figures/uo/uo}
\caption{{\label{fig:uo} $1530 \times 2444$ pixel source image used for the strong scaling test and "large" ($190 \times 190$ pixel per processor) weak scaling test.%
}}
\end{center}
\end{figure}
For the "large" weak scaling test, the work size is $190 \times 190$ pixels per processor. The source image for this test comes from the Urban Observatory (UO), run by the NYU Center for Urban Science and Progress. The image is sized $1530 \times 2444$ pixels with three color channels. This nighttime image shows a large amount of noise, especially in the red channel. It is converted to a single-channel grayscale image by taking the mean across the three channels. The image is then truncated to $8 \times 190 = 1520$ pixels per side, and then truncated again in each dimension by multiples of 190 pixels for each of the same processor counts as in the small weak scaling test.
For the strong scaling test, the full size $1530 \times 2444$ pixel grayscale image is used, and each implementation is tested on the same numbers of processors as in the weak scaling tests. One caveat of the strong scaling test performed here is that for the MPI implementation, if the number of processors assigned to each side of the image does not evenly divide the number of pixels per side of the image, then the additional rows and columns are truncated. For this image and the sizes tested, there are at most four rows or columns truncated, so the reduction in work size is minimal. No truncation is performed for the shared memory OpenMP implementation.
The tests were run on the Stampede supercomputer at the University of Texas. Stampede is a cluster featuring 16 processors (eight cores per each of two Xeon E5-2680 Sandy Bridge CPUs) and 32 GB DDR3 RAM per node, connected with a high-speed InfiniBand network. For tests on more than 16 processors, a job for four nodes was submitted, and for 16 or fewer processors, a separate job requesting a single node was submitted. Each test was performed with 100 iterations of the algorithm. Larger scale results are not reported here as the time to execute production queue jobs was reported as nearly 10 days when they were submitted. However, this document will be updated when the results are available.
All code was compiled on Stampede using ${\tt gcc}$ version 4.4.7 for OpenMP code and ${\tt mpicc}$ version 15.0.2 for MPI code. The following compiler flags were used for all timed test runs: $\verb|-lm -lrt -O3 -std=gnu99| $. The $\verb| -fopenmp| $ flag was also added to the OpenMP compilations. The MPI code was compiled against the MVAPICH2 library version 2.1, which is optimized to run on the high-performance network that undergirds Stampede.
\section{Results}
The weak scaling results for the small and large size tests are shown in Figures \ref{fig:weaksmall} and \ref{fig:weaklarge}. For the weak scaling results, the OpenMP implementation performs better than simple linear scaling of the single processor performance; however, it is not as quick as either of the MPI implementations. For the small work size, there is an easily seen communication penalty when the number of processors necessitates the use of more than one node. Once that communication penalty is incurred, though, the slope of the scaling is similar to the slope of the scaling on a single node.\selectlanguage{english}
\begin{figure}[h!]
\begin{center}
\includegraphics[width=0.70\columnwidth]{figures/Screenshot-from-2017-05-13-17-15-54/weak64}
\caption{{\label{fig:weaksmall} Weak scaling results for the small work size. The x-axis indicates the number of processors used, and the y-axis indicates the time in seconds per iteration of the computation. The MPI Interleaved data points lie on top of the MPI data, so the latter are not visible here.%
}}
\end{center}
\end{figure}\selectlanguage{english}
\begin{figure}[h!]
\begin{center}
\includegraphics[width=0.70\columnwidth]{figures/Screenshot-from-2017-05-13-17-17-08/weak190}
\caption{{\label{fig:weaklarge} Weak scaling results for the large work size. The x-axis indicates the number of processors used, and the y-axis indicates the time in seconds per iteration of the computation.%
}}
\end{center}
\end{figure}
The large weak scaling test shows a similar gap in performance between the OpenMP and MPI implementations. The large penalty seen in the small test due to internode communication is largely eliminated. This, and the lack of any noticeable improvement in timings between the merely initialized and fully interleaved variations of the MPI code, indicates that keeping the processors busy with larger subproblem sizes can effectively hide the latency of the network messages.
Future work on this project will include reporting on larger problem sizes and processor counts. A pure non-blocking implementation was also written, but not tested, and may provide interesting results given the latency result noted above. A hybrid OpenMP + MPI implementation could also be investigated to explore the gap between the performance of the shared and distributed memory versions.
All code for this project is available in a public repository at \hyperref[csl:8]{[8]}.\selectlanguage{english}
\begin{figure}[h!]
\begin{center}
\includegraphics[width=0.70\columnwidth]{figures/Screenshot-from-2017-05-13-18-22-41/strong}
\caption{{\label{fig:strong} Strong scaling results for the UO test image. The x-axis indicates the number of processors used, and the y-axis (plotted logarithmically) indicates the time in seconds per iteration of the computation.%
}}
\end{center}
\end{figure}
\section*{Acknowledgements}
I thank Georg Stadler and Bill Bao for their initial feedback on my project proposal and for literature recommendations. I also thank Federica Bianco for supplying images from the CUSP Urban Observatory for use in this paper.
\selectlanguage{english}
\FloatBarrier
\section*{References}\sloppy
\phantomsection
\label{csl:1}[1]L. I. Rudin, S. Osher, and E. Fatemi, ``{Nonlinear total variation based noise removal algorithms}'', \textit{Physica D: Nonlinear Phenomena}, vol. 60, no. 1-4, pp. 259–268, Nov. 1992, doi: 10.1016/0167-2789(92)90242-f. [Online]. Available at: \url{https://doi.org/10.1016\%2F0167-2789\%2892\%2990242-f}
\phantomsection
\label{csl:2}[2]A. Chambolle, V. Caselles, M. Novaga, D. Cremers, and T. Pock, ``{An introduction to Total Variation for Image Analysis}'', Nov-2009 [Online]. Available at: \url{https://hal.archives-ouvertes.fr/hal-00437581}
\phantomsection
\label{csl:3}[3]``{opencv: {Open} {Source} {Computer} {Vision} {Library}}''. OpenCV, May-2017 [Online]. Available at: \url{https://github.com/opencv/opencv.} [Accessed: 13-May-2017]
\phantomsection
\label{csl:4}[4]A. Chambolle and T. Pock, ``{A First-Order Primal-Dual Algorithm for Convex Problems with~Applications to Imaging}'', \textit{Journal of Mathematical Imaging and Vision}, vol. 40, no. 1, pp. 120–145, Dec. 2010, doi: 10.1007/s10851-010-0251-1. [Online]. Available at: \url{https://doi.org/10.1007\%2Fs10851-010-0251-1}
\phantomsection
\label{csl:5}[5]A. Mordvintsev, ``{{ROF} and {TV}-{L}1 denoising with {Primal}-{Dual} algorithm}''. Jun-2013 [Online]. Available at: \url{http://znah.net/rof-and-tv-l1-denoising-with-primal-dual-algorithm.html.} [Accessed: 13-May-2017]
\phantomsection
\label{csl:6}[6]J. Burkardt, ``{ppma\_io.c}''. 1998 [Online]. Available at: \url{https://people.sc.fsu.edu/~jburkardt/c_src/ppma_io/ppma_io.c.} [Accessed: 14-May-2017]
\phantomsection
\label{csl:7}[7]E. Jones, T. Oliphant, P. Peterson, and others, ``{{SciPy}: Open source scientific tools for {Python}}''. 2001-- [Online]. Available at: \url{http://www.scipy.org/}
\phantomsection
\label{csl:8}[8]C. Prince, ``{hpc17: {Coursework} for {NYU} {HPC} class}''. Feb-2017 [Online]. Available at: \url{https://github.com/cmprince/hpc17.} [Accessed: 14-May-2017]
\end{document}