mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: ?/ -*-
	2	///
	3	/// Fancy SIMD implementation of Salsa20 for ARM
	4	///
	5	/// (c) 2016 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// External definitions.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	///--------------------------------------------------------------------------
	34	/// Main code.
	35
	36	.arch armv7-a // assemble for the ARMv7-A architecture
	37	.fpu neon // allow NEON (Advanced SIMD) instructions
	38	.text // everything below goes in the code section
	39
	40	FUNC(salsa20_core_arm_neon)
	41
	42	// Arguments are in registers.
	43	// r0 is the number of rounds to perform
	44	// r1 points to the input matrix
	45	// r2 points to the output matrix
		//
		// Rounds are done in pairs (a column round followed by a row
		// round), and the first pair is done before r0 is examined. So at
		// least two rounds are always performed, and an odd round count
		// behaves like the next even number up.
		//
		// Registers written: r0 (used as the loop counter), the flags,
		// q0--q3 (scratch, then the output rows) and q8--q15. Nothing is
		// pushed on the stack, and the function returns with `bx r14'.
		//
		// NOTE(review): QQ() and D1() come from asm-common.h, which isn't
		// visible here. From the way the registers are used below,
		// QQ(qa, qb) presumably names the register range qa--qb, and D1(q)
		// presumably names the upper doubleword of q -- confirm.
	46
	47	// First job is to slurp the matrix into the SIMD registers. The
	48	// words have already been permuted conveniently to make them line up
	49	// better for SIMD processing.
	50	//
	51	// The textbook arrangement of the matrix is this.
	52	//
	53	// [C K K K]
	54	// [K C N N]
	55	// [T T C K]
	56	// [K K K C]
	57	//
	58	// But we've rotated the columns up so that the main diagonal with
	59	// the constants on it ends up in the first row, giving something more
	60	// like
	61	//
	62	// [C C C C]
	63	// [K T K K]
	64	// [T K K N]
	65	// [K K N K]
	66	//
	67	// so the transformation looks like this:
	68	//
	69	// [ 0  1  2  3]      [ 0  5 10 15]  (a, q8)
	70	// [ 4  5  6  7] -->  [ 4  9 14  3]  (b, q9)
	71	// [ 8  9 10 11]      [ 8 13  2  7]  (c, q10)
	72	// [12 13 14 15]      [12  1  6 11]  (d, q11)
	73	//
	74	// We need a copy for later. Rather than waste time copying them by
	75	// hand, we'll use the three-address nature of the instruction set.
	76	// But this means that the main loop is offset by a bit.
		//
		// Concretely: the input rows are loaded into q12--q15 and left
		// there for the final feedforward addition. The first column
		// round below reads q12--q15 and writes its results into the
		// working registers q8--q11.
	77	vldmia r1, {QQ(q12, q15)}
	78
	79	// Apply a column quarterround to each of the columns simultaneously,
	80	// moving the results to their working registers. Alas, there
	81	// doesn't seem to be a packed word rotate, so we have to synthesize
	82	// it.
		//
		// Each rotate-left by n is built from a left shift by n, a logical
		// right shift by 32 - n, and an OR of the two, using q0 and q1 as
		// scratch.
	83
	84	// b ^= (a + d) <<< 7
	85	vadd.u32 q0, q12, q15 // q0 = a + d, from the saved input
	86	vshl.u32 q1, q0, #7 // q1 = q0 << 7
	87	vshr.u32 q0, q0, #25 // q0 = q0 >> 25
	88	vorr q0, q0, q1 // q0 = (a + d) <<< 7
	89	veor q9, q13, q0 // new b goes to q9; q13 keeps the input
	90
	91	// c ^= (b + a) <<< 9
	92	vadd.u32 q0, q9, q12
	93	vshl.u32 q1, q0, #9
	94	vshr.u32 q0, q0, #23
	95	vorr q0, q0, q1
	96	veor q10, q14, q0 // new c goes to q10; q14 keeps the input
	97
	98	// d ^= (c + b) <<< 13
	99	vadd.u32 q0, q10, q9
	100	vext.32 q9, q9, q9, #3 // b no longer needed here: rotate its lanes for the row round
	101	vshl.u32 q1, q0, #13
	102	vshr.u32 q0, q0, #19
	103	vorr q0, q0, q1
	104	veor q11, q15, q0 // new d goes to q11; q15 keeps the input
	105
	106	// a ^= (d + c) <<< 18
	107	vadd.u32 q0, q11, q10
	108	vext.32 q10, q10, q10, #2 // c and d are finished with too: rotate
	109	vext.32 q11, q11, q11, #1 // their lanes for the row round
	110	vshl.u32 q1, q0, #18
	111	vshr.u32 q0, q0, #14
	112	vorr q0, q0, q1
	113	veor q8, q12, q0 // new a goes to q8; q12 keeps the input
	114
	115	0:
	116	// The transpose conveniently only involves reordering elements of
	117	// individual rows, which can be done quite easily, and reordering
	118	// the rows themselves, which is a trivial renaming. It doesn't
	119	// involve any movement of elements between rows.
	120	//
	121	// [ 0  5 10 15]      [ 0  5 10 15]  (a, q8)
	122	// [ 4  9 14  3] -->  [ 1  6 11 12]  (b, q11)
	123	// [ 8 13  2  7]      [ 2  7  8 13]  (c, q10)
	124	// [12  1  6 11]      [ 3  4  9 14]  (d, q9)
	125	//
	126	// The reorderings have been pushed upwards to reduce delays.
		// (They are the vext instructions interleaved with the previous
		// half-round.)
	127
	128	// Apply the row quarterround to each of the columns (yes!)
	129	// simultaneously.
	130
	131	// b ^= (a + d) <<< 7
	132	vadd.u32 q0, q8, q9 // in this half, d is q9 and b is q11
	133	vshl.u32 q1, q0, #7
	134	vshr.u32 q0, q0, #25
	135	vorr q0, q0, q1
	136	veor q11, q11, q0
	137
	138	// c ^= (b + a) <<< 9
	139	vadd.u32 q0, q11, q8
	140	vshl.u32 q1, q0, #9
	141	vshr.u32 q0, q0, #23
	142	vorr q0, q0, q1
	143	veor q10, q10, q0
	144
	145	// d ^= (c + b) <<< 13
	146	vadd.u32 q0, q10, q11
	147	vext.32 q11, q11, q11, #3 // 1 + 3 lanes: q11 is back in column-round order
	148	vshl.u32 q1, q0, #13
	149	vshr.u32 q0, q0, #19
	150	vorr q0, q0, q1
	151	veor q9, q9, q0
	152
	153	// a ^= (d + c) <<< 18
	154	vadd.u32 q0, q9, q10
	155	vext.32 q10, q10, q10, #2 // 2 + 2 lanes: q10 is back in column-round order
	156	vext.32 q9, q9, q9, #1 // 3 + 1 lanes: q9 is back in column-round order
	157	vshl.u32 q1, q0, #18
	158	vshr.u32 q0, q0, #14
	159	vorr q0, q0, q1
	160	veor q8, q8, q0
	161
	162	// We had to undo the transpose ready for the next loop. Again, push
	163	// back the reorderings to reduce latency. Decrement the loop
	164	// counter and see if we should go round again.
	165	subs r0, r0, #2 // a column round and a row round are now done
	166	bls 9f // finish if the count was 2 or less (unsigned)
	167
	168	// Do the first half of the next round because this loop is offset.
	169
	170	// b ^= (a + d) <<< 7
	171	vadd.u32 q0, q8, q11 // column round again: b is q9 and d is q11
	172	vshl.u32 q1, q0, #7
	173	vshr.u32 q0, q0, #25
	174	vorr q0, q0, q1
	175	veor q9, q9, q0
	176
	177	// c ^= (b + a) <<< 9
	178	vadd.u32 q0, q9, q8
	179	vshl.u32 q1, q0, #9
	180	vshr.u32 q0, q0, #23
	181	vorr q0, q0, q1
	182	veor q10, q10, q0
	183
	184	// d ^= (c + b) <<< 13
	185	vadd.u32 q0, q10, q9
	186	vext.32 q9, q9, q9, #3 // rotate lanes ready for the row round
	187	vshl.u32 q1, q0, #13
	188	vshr.u32 q0, q0, #19
	189	vorr q0, q0, q1
	190	veor q11, q11, q0
	191
	192	// a ^= (d + c) <<< 18
	193	vadd.u32 q0, q11, q10
	194	vext.32 q10, q10, q10, #2 // rotate lanes ready for the row round
	195	vext.32 q11, q11, q11, #1 // likewise
	196	vshl.u32 q1, q0, #18
	197	vshr.u32 q0, q0, #14
	198	vorr q0, q0, q1
	199	veor q8, q8, q0
	200
	201	b 0b // back round for the matching row round
	202
	203	// Almost there. Firstly the feedforward addition. Also, establish a
	204	// constant which will be useful later.
		//
		// q12 has already been read by the first addition by the time it
		// is overwritten with the mask.
	205	9: vadd.u32 q0, q8, q12 // 0, 5, 10, 15
	206	vmov.i64 q12, #0xffffffff // = (-1, 0, -1, 0)
	207	vadd.u32 q1, q9, q13 // 4, 9, 14, 3
	208	vadd.u32 q2, q10, q14 // 8, 13, 2, 7
	209	vadd.u32 q3, q11, q15 // 12, 1, 6, 11
	210
	211	// Next we must undo the permutation which was already applied to the
	212	// input. The core trick is from Dan Bernstein's `armneon3'
	213	// implementation, but with a lot of liposuction.
		//
		// Keep a copy of q0: it is overwritten by the first vbif below,
		// but its original value is still needed for the last one. (q15
		// is free because the additions above have finished with it.)
	214	vmov q15, q0
	215
	216	// Sort out the columns by pairs.
		//
		// vbif replaces the destination's bits with the second operand's
		// bits wherever the mask (third operand) is zero. With the mask
		// (-1, 0, -1, 0), lanes 0 and 2 of the destination are kept and
		// lanes 1 and 3 are taken from the second operand.
	217	vbif q0, q3, q12 // 0, 1, 10, 11
	218	vbif q3, q2, q12 // 12, 13, 6, 7
	219	vbif q2, q1, q12 // 8, 9, 2, 3
	220	vbif q1, q15, q12 // 4, 5, 14, 15
	221
	222	// Now fix up the remaining discrepancies.
		// (Going by the lane comments above, this exchanges the top two
		// lanes of each pair of registers.)
	223	vswp D1(q0), D1(q2) // q0 = 0, 1, 2, 3; q2 = 8, 9, 10, 11
	224	vswp D1(q1), D1(q3) // q1 = 4, 5, 6, 7; q3 = 12, 13, 14, 15
	225
	226	// And with that, we're done.
	227	vstmia r2, {QQ(q0, q3)} // store the output rows at r2
	228	bx r14 // return to the caller
	229
	230	ENDFUNC
	231
	232	///----- That's all, folks --------------------------------------------------