{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T15:31:13Z","timestamp":1772119873552,"version":"3.50.1"},"reference-count":20,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2024,3,12]],"date-time":"2024-03-12T00:00:00Z","timestamp":1710201600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"},{"start":{"date-parts":[[2024,3,12]],"date-time":"2024-03-12T00:00:00Z","timestamp":1710201600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"}],"funder":[{"DOI":"10.13039\/501100000780","name":"European Commission","doi-asserted-by":"crossref","award":["95555"],"award-info":[{"award-number":["95555"]}],"id":[{"id":"10.13039\/501100000780","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100000780","name":"European Union","doi-asserted-by":"crossref","award":["95555"],"award-info":[{"award-number":["95555"]}],"id":[{"id":"10.13039\/501100000780","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100011011","name":"Junta de Andaluc\u00eda","doi-asserted-by":"publisher","award":["POSTDOC_21_00025"],"award-info":[{"award-number":["POSTDOC_21_00025"]}],"id":[{"id":"10.13039\/501100011011","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100011033","name":"Agencia Estatal de Investigaci\u00f3n","doi-asserted-by":"crossref","award":["FJC2019-039222"],"award-info":[{"award-number":["FJC2019-039222"]}],"id":[{"id":"10.13039\/501100011033","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100011033","name":"Agencia Estatal de Investigaci\u00f3n","doi-asserted-by":"crossref","award":["PID2020-113656R"],"award-info":[{"award-number":["PID2020-113656R"]}],"id":[{"id":"10.13039\/501100011033","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100003359","name":"Generalitat Valenciana","doi-asserted-by":"crossref","award":["CIDEXG\/2022\/013"],"award-info":[{"award-number":["CIDEXG\/2022\/013"]}],"id":[{"id":"10.13039\/501100003359","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100003359","name":"Generalitat Valenciana","doi-asserted-by":"crossref","award":["PROMETEO 2023-CIPROM\/2022\/2"],"award-info":[{"award-number":["PROMETEO 2023-CIPROM\/2022\/2"]}],"id":[{"id":"10.13039\/501100003359","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100004233","name":"Universitat Polit\u00e8cnica de Val\u00e8ncia","doi-asserted-by":"crossref","id":[{"id":"10.13039\/501100004233","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2024,7]]},"abstract":"<jats:title>Abstract<\/jats:title>\n                  <jats:p>\n                    General matrix multiplication (\n                    <jats:sc>gemm<\/jats:sc>\n                    ) is a fundamental kernel in scientific computing and current frameworks for deep learning. Modern realisations of\n                    <jats:sc>gemm<\/jats:sc>\n                    are mostly written in C, on top of a small, highly tuned\n                    <jats:italic>micro-kernel<\/jats:italic>\n                    that is usually encoded in assembly. The high performance realisation of\n                    <jats:sc>gemm<\/jats:sc>\n                    in linear algebra libraries in general include a single micro-kernel per architecture, usually implemented by an expert. In this paper, we explore a couple of paths to automatically generate\n                    <jats:sc>gemm<\/jats:sc>\n                    micro-kernels, either using C++ templates with vector intrinsics or high-level Python scripts that directly produce assembly code. Both solutions can integrate high performance software techniques, such as loop unrolling and software pipelining, accommodate any data type, and easily generate micro-kernels of any requested dimension. The performance of this solution is tested on three ARM-based cores and compared with state-of-the-art libraries for these processors: BLIS, OpenBLAS and ArmPL. The experimental results show that the auto-generation approach is highly competitive, mainly due to the possibility of adapting the micro-kernel to the problem dimensions.\n                  <\/jats:p>","DOI":"10.1007\/s11227-024-05955-8","type":"journal-article","created":{"date-parts":[[2024,3,12]],"date-time":"2024-03-12T20:02:37Z","timestamp":1710273757000},"page":"13873-13899","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Automatic generation of ARM NEON micro-kernels for matrix multiplication"],"prefix":"10.1007","volume":"80","author":[{"given":"Guillermo","family":"Alaejos","sequence":"first","affiliation":[]},{"given":"H\u00e9ctor","family":"Mart\u00ednez","sequence":"additional","affiliation":[]},{"given":"Adri\u00e1n","family":"Castell\u00f3","sequence":"additional","affiliation":[]},{"given":"Manuel F.","family":"Dolz","sequence":"additional","affiliation":[]},{"given":"Francisco D.","family":"Igual","sequence":"additional","affiliation":[]},{"given":"Pedro","family":"Alonso-Jord\u00e1","sequence":"additional","affiliation":[]},{"given":"Enrique S.","family":"Quintana-Ort\u00ed","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,3,12]]},"reference":[{"issue":"1","key":"5955_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/77626.79170","volume":"16","author":"JJ Dongarra","year":"1990","unstructured":"Dongarra JJ, Du Croz J, Hammarling S, Duff I (1990) A set of level 3 basic linear algebra subprograms. ACM Trans Math Softw 16(1):1\u201317","journal-title":"ACM Trans Math Softw"},{"issue":"3","key":"5955_CR2","doi-asserted-by":"publisher","first-page":"268","DOI":"10.1145\/292395.292412","volume":"24","author":"B K\u00e5gstr\u00f6m","year":"1998","unstructured":"K\u00e5gstr\u00f6m B, Ling P, van Loan C (1998) GEMM-based level 3 BLAS: High-performance model implementations and performance evaluation benchmark. ACM Trans Math Softw 24(3):268\u2013302","journal-title":"ACM Trans Math Softw"},{"issue":"1","key":"5955_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1377603.1377607","volume":"35","author":"K Goto","year":"2008","unstructured":"Goto K, van de Geijn R (2008) High-performance implementation of the level-3 BLAS. ACM Trans Math Soft 35(1):1\u201314","journal-title":"ACM Trans Math Soft"},{"issue":"12","key":"5955_CR4","doi-asserted-by":"publisher","first-page":"2295","DOI":"10.1109\/JPROC.2017.2761740","volume":"105","author":"V Sze","year":"2017","unstructured":"Sze V, Chen Y-H, Yang T-J, Emer JS (2017) Efficient processing of deep neural networks: a tutorial and survey. Proc IEEE 105(12):2295\u20132329","journal-title":"Proc IEEE"},{"issue":"4","key":"5955_CR5","first-page":"65:1","volume":"52","author":"T Ben-Nun","year":"2019","unstructured":"Ben-Nun T, Hoefler T (2019) Demystifying parallel and distributed deep learning: an in-depth concurrency analysis. ACM Comput Surv 52(4):65:1-65:43","journal-title":"ACM Comput Surv"},{"issue":"3","key":"5955_CR6","doi-asserted-by":"publisher","first-page":"12:1","DOI":"10.1145\/1356052.1356053","volume":"34","author":"K Goto","year":"2008","unstructured":"Goto K, van de Geijn RA (2008) Anatomy of a high-performance matrix multiplication. ACM Trans Math Softw 34(3):12:1-12:25","journal-title":"ACM Trans Math Softw"},{"issue":"3","key":"5955_CR7","doi-asserted-by":"crossref","first-page":"14:1","DOI":"10.1145\/2764454","volume":"41","author":"FG Van Zee","year":"2015","unstructured":"Van Zee FG, van de Geijn RA (2015) BLIS: a framework for rapidly instantiating BLAS functionality. ACM Trans Math Softw 41(3):14:1-14:33","journal-title":"ACM Trans Math Softw"},{"key":"5955_CR8","unstructured":"OpenBLAS, http:\/\/xianyi.github.com\/OpenBLAS\/ (2012)"},{"key":"5955_CR9","doi-asserted-by":"publisher","first-page":"8124","DOI":"10.1007\/s11227-022-05003-3","volume":"79","author":"G Alaejos","year":"2023","unstructured":"Alaejos G, Castell\u00f3 A, Mart\u00ednez H, Alonso-Jord\u00e1 P, Igual FD, Quintana-Ort\u00ed ES (2023) Micro-kernels for portable and efficient matrix multiplication in deep learning. J Supercomput 79:8124\u20138147","journal-title":"J Supercomput"},{"key":"5955_CR10","unstructured":"Mart\u00ednez H, Catal\u00e1n S, Igual FD, Herrero JR, Rodr\u00edguez-S\u00e1nchez R, Quintana-Ort\u00ed ES (2023) Co-design of the dense linear algebra software stack for multicore processors, arXiv:2304.14480"},{"issue":"2","key":"5955_CR11","first-page":"12:1","volume":"43","author":"TM Low","year":"2016","unstructured":"Low TM, Igual FD, Smith TM, Quintana-Ort\u00ed ES (2016) Analytical modeling is enough for high-performance BLIS. ACM Trans Math Softw 43(2):12:1-12:18","journal-title":"ACM Trans Math Softw"},{"key":"5955_CR12","doi-asserted-by":"publisher","unstructured":"Williams S, Waterman A, Patterson D (2009) Roofline: an insightful visual performance model for multicore architectures. Commun ACM 52(4):65\u201376. https:\/\/doi.org\/10.1145\/1498765.1498785","DOI":"10.1145\/1498765.1498785"},{"key":"5955_CR13","doi-asserted-by":"publisher","unstructured":"Zee FGV, Smith TM, Marker B, Low TM, Geijn RAVD, Igual FD, Smelyanskiy M, Zhang X, Kistler M, Austel V, Gunnels JA, Killough L (2016) The BLIS framework: Experiments in portability. ACM Trans Math Softw 42(2). https:\/\/doi.org\/10.1145\/2755561","DOI":"10.1145\/2755561"},{"issue":"3","key":"5955_CR14","doi-asserted-by":"publisher","first-page":"1037","DOI":"10.1007\/s10586-016-0611-8","volume":"19","author":"S Catal\u00e1n","year":"2016","unstructured":"Catal\u00e1n S, Igual FD, Mayo R, Rodr\u00edguez-S\u00e1nchez R, Quintana-Ort\u00ed ES (2016) Architecture-aware configuration and scheduling of matrix multiplication on asymmetric multicore processors. Clust Comput 19(3):1037\u20131051","journal-title":"Clust Comput"},{"key":"5955_CR15","unstructured":"Dowd K, Severance CR (1998) High performance computing, 2nd edn. O\u2019Reilly"},{"issue":"C","key":"5955_CR16","doi-asserted-by":"publisher","first-page":"240","DOI":"10.1016\/j.jpdc.2022.05.009","volume":"167","author":"S Barrachina","year":"2022","unstructured":"Barrachina S, Dolz MF, San Juan P, Quintana-Ort\u00ed ES (2022) Efficient and portable GEMM-based convolution operators for deep neural network training on multicore processors. J Parallel Distrib Comput 167(C):240\u2013254","journal-title":"J Parallel Distrib Comput"},{"key":"5955_CR17","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"5955_CR18","doi-asserted-by":"crossref","unstructured":"Szegedy C, et al (2014) Going deeper with convolutions. CoRR [Online]. Available: arXiv:1409.4842","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"5955_CR19","unstructured":"Chellapilla K, Puri S, Simard P (2006) High performance convolutional neural networks for document processing. In: International Workshop on Frontiers in Handwriting Recognition"},{"key":"5955_CR20","unstructured":"ArmPL: Arm Performance Libraries, https:\/\/developer.arm.com\/downloads\/-\/arm-performance-libraries. Accessed July 2023"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-024-05955-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-024-05955-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-024-05955-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,6,10]],"date-time":"2024-06-10T07:15:18Z","timestamp":1718003718000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-024-05955-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,3,12]]},"references-count":20,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2024,7]]}},"alternative-id":["5955"],"URL":"https:\/\/doi.org\/10.1007\/s11227-024-05955-8","relation":{"has-preprint":[{"id-type":"doi","id":"10.21203\/rs.3.rs-3388260\/v1","asserted-by":"object"}]},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"value":"0920-8542","type":"print"},{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,3,12]]},"assertion":[{"value":"3 February 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 March 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}]}}