@article{blas,
  title     = {Basic Linear Algebra Subprograms for {Fortran} Usage},
  author    = {Lawson, Charles L. and Hanson, Richard J. and Kincaid, David R. and Krogh, Fred T.},
  journal   = {ACM Transactions on Mathematical Software (TOMS)},
  volume    = {5},
  number    = {3},
  pages     = {308--323},
  year      = {1979},
  doi       = {10.1145/355841.355847},
  publisher = {ACM}
}

@misc{cudadoc,
  author = {{NVIDIA}},
  title  = {{CUDA C} Programming Guide},
  year   = {2015},
  url    = {https://docs.nvidia.com/cuda/cuda-c-programming-guide/},
  note   = {Accessed: 2015-09-27}
}



@article{flynn,
  author  = {Flynn, Michael J.},
  title   = {Very High-Speed Computing Systems},
  journal = {Proceedings of the IEEE},
  year    = {1966},
  month   = dec,
  volume  = {54},
  number  = {12},
  pages   = {1901--1909},
  doi     = {10.1109/PROC.1966.5273},
  issn    = {0018-9219}
}

@article{survey,
  author  = {Heller, Don},
  title   = {A Survey of Parallel Algorithms in Numerical Linear Algebra},
  journal = {SIAM Review},
  volume  = {20},
  number  = {4},
  pages   = {740--777},
  year    = {1978},
  doi     = {10.1137/1020096}
}

@misc{cublas,
  author = {{NVIDIA}},
  title  = {{cuBLAS} library},
  year   = {2008},
  url    = {https://developer.nvidia.com/cuBLAS},
  note   = {Accessed: 2015-09-27}
}

@article{viennacl,
  title   = {{ViennaCL}---A High Level Linear Algebra Library for {GPUs} and Multi-Core {CPUs}},
  author  = {Rupp, Karl and Rudolf, Florian and Weinbub, Josef},
  journal = {Proc. GPUScA},
  pages   = {51--56},
  year    = {2010}
}

@inproceedings{magma,
  title        = {Numerical Linear Algebra on Emerging Architectures: The {PLASMA} and {MAGMA} Projects},
  author       = {Agullo, Emmanuel and Demmel, James W. and Dongarra, Jack and Hadri, Bilel and Kurzak, Jakub and Langou, Julien and Ltaief, Hatem and Luszczek, Piotr and Tomov, Stanimire},
  booktitle    = {Journal of Physics: Conference Series},
  volume       = {180},
  number       = {1},
  pages        = {012037},
  year         = {2009},
  organization = {IOP Publishing}
}



@article{cula,
  author   = {Humphrey, John R. and Price, Daniel K. and Spagnoli, Kyle E. and Paolini, Aaron L. and Kelmelis, Eric J.},
  title    = {{CULA}: Hybrid {GPU} Accelerated Linear Algebra Routines},
  journal  = {Proc. SPIE},
  volume   = {7705},
  pages    = {770502--770502-7},
  year     = {2010},
  doi      = {10.1117/12.850538},
  abstract = {The modern graphics processing unit (GPU) found in many standard personal computers is a highly parallel math processor capable of nearly 1 TFLOPS peak throughput at a cost similar to a high-end CPU and an excellent FLOPS/watt ratio. High-level linear algebra operations are computationally intense, often requiring O(N3) operations and would seem a natural fit for the processing power of the GPU. Our work is on CULA, a GPU accelerated implementation of linear algebra routines. We present results from factorizations such as LU decomposition, singular value decomposition and QR decomposition along with applications like system solution and least squares. The GPU execution model featured by NVIDIA GPUs based on CUDA demands very strong parallelism, requiring between hundreds and thousands of simultaneous operations to achieve high performance. Some constructs from linear algebra map extremely well to the GPU and others map poorly. CPUs, on the other hand, do well at smaller order parallelism and perform acceptably during low-parallelism code segments. Our work addresses this via hybrid a processing model, in which the CPU and GPU work simultaneously to produce results. In many cases, this is accomplished by allowing each platform to do the work it performs most naturally.}
}

@techreport{sparse,
  title       = {Optimizing Sparse Matrix-Vector Multiplication on {GPUs} Using Compile-Time and Run-Time Strategies},
  author      = {Baskaran, Muthu Manikandan and Bordawekar, Rajesh},
  institution = {IBM Research},
  type        = {IBM Research Report},
  number      = {RC24704 (W0812-047)},
  year        = {2008}
}

@techreport{bell2008efficient,
  title       = {Efficient Sparse Matrix-Vector Multiplication on {CUDA}},
  author      = {Bell, Nathan and Garland, Michael},
  institution = {NVIDIA Corporation},
  type        = {NVIDIA Technical Report},
  number      = {NVR-2008-004},
  year        = {2008}
}

@inproceedings{efficiency,
  author    = {Fatahalian, K. and Sugerman, J. and Hanrahan, P.},
  title     = {Understanding the Efficiency of {GPU} Algorithms for Matrix-Matrix Multiplication},
  booktitle = {Proceedings of the ACM SIGGRAPH/EUROGRAPHICS Conference on Graphics Hardware},
  series    = {HWWS '04},
  year      = {2004},
  isbn      = {3-905673-15-0},
  location  = {Grenoble, France},
  pages     = {133--137},
  numpages  = {5},
  url       = {http://doi.acm.org/10.1145/1058129.1058148},
  doi       = {10.1145/1058129.1058148},
  acmid     = {1058148},
  publisher = {ACM},
  address   = {New York, NY, USA}
}


@inproceedings{volkov2008benchmarking,
  title        = {Benchmarking {GPUs} to Tune Dense Linear Algebra},
  author       = {Volkov, Vasily and Demmel, James W.},
  booktitle    = {Proceedings of the 2008 ACM/IEEE Conference on Supercomputing (SC '08)},
  pages        = {1--11},
  year         = {2008},
  doi          = {10.1109/SC.2008.5214359},
  organization = {IEEE}
}