/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// GCM acceleration for ARM64 processors
///
/// (c) 2019 Straylight/Edgeware
///
///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software: you can redistribute it and/or modify it
/// under the terms of the GNU Library General Public License as published
/// by the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful, but
/// WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
/// Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb.  If not, write to the Free Software
/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
/// USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

        .arch   armv8-a+crypto

        .text

///--------------------------------------------------------------------------
/// Multiplication macros.

        // The good news is that we have a fancy instruction to do the
        // multiplications.  The bad news is that it's not particularly
        // well-suited to the job.
        //
        // For one thing, it only does a 64-bit multiplication, so in general
        // we'll need to synthesize the full-width multiply by hand.  For
        // another thing, it doesn't help with the reduction, so we have to
        // do that by hand too.  And, finally, GCM has crazy bit ordering,
        // and the instruction does nothing useful for that at all.
        //
        // Focusing on that last problem first: the bits aren't in monotonic
        // significance order unless we permute them.  Fortunately, ARM64
        // has an instruction which will just permute the bits in each byte
        // for us, so we don't have to worry about this very much.
        //
        // Our main weapons, the `pmull' and `pmull2' instructions, work on
        // 64-bit operands, in half of a vector register, and produce
        // 128-bit results.  But neither of them will multiply the high half
        // of one vector by the low half of a second one, so we have a
        // problem, which we solve by representing one of the operands
        // redundantly: rather than packing the 64-bit pieces together, we
        // duplicate each 64-bit piece across both halves of a register.
        //
        // The commentary for `mul128' is the most detailed.  The other
        // macros assume that you've already read and understood that.

.macro  mul128
        // Enter with u and v in v0 and v1/v2 respectively, and 0 in v31;
        // leave with z = u v in v0.  Clobbers v1--v6.

        // First for the double-precision multiplication.  It's tempting to
        // use Karatsuba's identity here, but I suspect that loses more in
        // the shifting, bit-twiddling, and dependency chains that it gains
        // in saving a multiplication which otherwise pipelines well.
        // v0 =                         // (u_1; u_0)
        // v1/v2 =                      // (v_1; v_0)
        pmull2  v3.1q, v0.2d, v1.2d     // u_1 v_0
        pmull   v4.1q, v0.1d, v2.1d     // u_0 v_1
        pmull2  v5.1q, v0.2d, v2.2d     // (x_3; t_1) = u_1 v_1
        pmull   v6.1q, v0.1d, v1.1d     // (t_0; x_0) = u_0 v_0

        // Arrange the pieces to form a double-precision polynomial.
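        // To spell out the algebra being assembled here: written as 256-bit
        // quantities, most-significant word first, the three partial
        // products are
        //
        //      u_1 v_1 t^128             = (x_3; t_1; 0; 0)
        //      (u_0 v_1 + u_1 v_0) t^64  = (0; m_1; m_0; 0)
        //      u_0 v_0                   = (0; 0; t_0; x_0)
        //
        // and u v is their sum.  The middle term m straddles the two
        // 128-bit halves of the result, which is why it's split with a
        // 64-bit shift each way below, giving x_1 = t_0 + m_0 and
        // x_2 = t_1 + m_1.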
        eor     v3.16b, v3.16b, v4.16b  // (m_1; m_0) = u_0 v_1 + u_1 v_0
        vshr128 v4, v3, 64              // (0; m_1)
        vshl128 v3, v3, 64              // (m_0; 0)
        eor     v1.16b, v5.16b, v4.16b  // (x_3; x_2)
        eor     v0.16b, v6.16b, v3.16b  // (x_1; x_0)

        // And now the only remaining difficulty is that the result needs to
        // be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128
        // = t^7 + t^2 + t + 1 in our field.  So far, we've calculated z_0
        // and z_1 such that z_0 + z_1 R = u v using the identity R = t^128:
        // now we must fold z_1 back into z_0 using the other identity
        // R = t^7 + t^2 + t + 1.  Write the 64-bit words of the product as
        // y_0, ..., y_3, so that z_0 = (y_1; y_0) and z_1 = (y_3; y_2).
        //
        // We do this by working on y_2 and y_3 separately, so consider y_i
        // for i = 2 or 3.  Certainly, y_i t^{64i} = y_i R t^{64(i-2)} =
        // (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that
        // directly without breaking up the 64-bit word structure.  Instead,
        // we start by considering just y_i t^7 t^{64(i-2)}, which again
        // looks tricky.  Now, split y_i = a_i + t^57 b_i, with deg a_i < 57;
        // then
        //
        //      y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
        //
        // We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit
        // contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
        // splits are different.  This is lovely, with one small snag: when
        // we do this to y_3, we end up with a contribution back into the
        // t^128 coefficient word.  But notice that only the low seven bits
        // of this word are affected, so there's no knock-on contribution
        // into the t^64 word.  Therefore, if we handle the high bits of each
        // word together, and then the low bits, everything will be fine.

        // First, shift the high bits down.
        ushr    v2.2d, v1.2d, #63       // the b_i for t
        ushr    v3.2d, v1.2d, #62       // the b_i for t^2
        ushr    v4.2d, v1.2d, #57       // the b_i for t^7
        eor     v2.16b, v2.16b, v3.16b  // add them all together
        eor     v2.16b, v2.16b, v4.16b
        vshr128 v3, v2, 64
        vshl128 v4, v2, 64
        eor     v1.16b, v1.16b, v3.16b  // contribution into high half
        eor     v0.16b, v0.16b, v4.16b  // and low half

        // And then shift the low bits up.
        shl     v2.2d, v1.2d, #1
        shl     v3.2d, v1.2d, #2
        shl     v4.2d, v1.2d, #7
        eor     v1.16b, v1.16b, v2.16b  // unit and t contribs
        eor     v3.16b, v3.16b, v4.16b  // t^2 and t^7 contribs
        eor     v0.16b, v0.16b, v1.16b  // mix everything together
        eor     v0.16b, v0.16b, v3.16b  // ... and we're done
.endm

.macro  mul64
        // Enter with u and v in the low halves of v0 and v1, respectively;
        // leave with z = u v in x2.  Clobbers x2--x4.

        // The multiplication is thankfully easy.
        // v0 =                         // (?; u)
        // v1 =                         // (?; v)
        pmull   v0.1q, v0.1d, v1.1d     // u v

        // Now we must reduce.  This is essentially the same as the 128-bit
        // case above, but mostly simpler because everything is smaller.
        // The polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.

        // Before we get stuck in, transfer the product to general-purpose
        // registers.
        mov     x3, v0.d[1]
        mov     x2, v0.d[0]

        // First, shift the high bits down.
        eor     x4, x3, x3, lsr #1      // pre-mix t^3 and t^4
        eor     x3, x3, x3, lsr #63     // mix in t contribution
        eor     x3, x3, x4, lsr #60     // shift and mix in t^3 and t^4

        // And then shift the low bits up.
        eor     x3, x3, x3, lsl #1      // mix unit and t; pre-mix t^3, t^4
        eor     x2, x2, x3              // fold them in
        eor     x2, x2, x3, lsl #3      // and t^3 and t^4
.endm
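        // As a cross-check on the reduction above, here's a rough C model
        // of what `mul64' computes on its (already bit-reversed) register-
        // format inputs.  It's purely illustrative -- it isn't part of this
        // file or of Catacomb's build, and the function names are invented
        // for the sketch.
        //
        //      #include <stdint.h>
        //
        //      /* 128-bit carry-less product of u and v, by shift-and-XOR. */
        //      static void clmul64(uint64_t u, uint64_t v,
        //                          uint64_t *hi, uint64_t *lo)
        //      {
        //        uint64_t h = 0, l = 0;
        //        for (int i = 0; i < 64; i++)
        //          if ((v >> i)&1) {
        //            l ^= u << i;
        //            if (i) h ^= u >> (64 - i);
        //          }
        //        *hi = h; *lo = l;
        //      }
        //
        //      /* Multiply and reduce mod t^64 + t^4 + t^3 + t + 1. */
        //      static uint64_t mul64_model(uint64_t u, uint64_t v)
        //      {
        //        uint64_t hi, lo;
        //        clmul64(u, v, &hi, &lo);
        //        /* first, shift the high bits down ... */
        //        hi ^= (hi >> 63) ^ (hi >> 61) ^ (hi >> 60);
        //        /* ... and then shift the low bits up */
        //        return lo ^ hi ^ (hi << 1) ^ (hi << 3) ^ (hi << 4);
        //      }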
.macro  mul96
        // Enter with u in the least-significant 96 bits of v0, with zero in
        // the upper 32 bits, and with the least-significant 64 bits of v in
        // both halves of v1, and the upper 32 bits of v in the low 32 bits
        // of each half of v2, with zero in the upper 32 bits; and with zero
        // in v31.  Yes, that's a bit hairy.  Leave with the product u v in
        // the low 96 bits of v0, and /junk/ in the high 32 bits.  Clobbers
        // v1--v6.

        // This is an inconvenient size.  There's nothing for it but to do
        // four multiplications, as if for the 128-bit case.  The entry
        // conditions guarantee that the upper 32 bits of each operand
        // register are zero, so the full product fits in 192 bits, which we
        // collect below as 96-bit low and high halves.
        // v0 =                         // (u_2; u_0 + u_1 t^32)
        // v1 =                         // (v_0 + v_1 t^32; v_0 + v_1 t^32)
        // v2 =                         // (v_2; v_2)
        pmull2  v5.1q, v0.2d, v1.2d     // u_2 (v_0 + v_1 t^32) = e_0
        pmull   v4.1q, v0.1d, v2.1d     // v_2 (u_0 + u_1 t^32) = e_1
        pmull2  v6.1q, v0.2d, v2.2d     // u_2 v_2 = d = (0; d)
        pmull   v3.1q, v0.1d, v1.1d     // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
                                        //   + u_1 v_1 t^64 = f

        // Extract the high and low halves of the 192-bit result.  The
        // answer we want is d t^128 + e t^64 + f, where e = e_0 + e_1.  The
        // low 96 bits of the answer will end up in v0, with junk in the top
        // 32 bits; the high 96 bits will end up in v1, which must have zero
        // in its top 32 bits.
        //
        // Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
        // in the low 96 bits of a SIMD register, with junk in the top 32
        // bits; and top(x) is the high 96 bits, also arranged in the low 96
        // bits of a register, with /zero/ in the top 32 bits.
        eor     v4.16b, v4.16b, v5.16b  // e_0 + e_1 = e
        vshl128 v6, v6, 32              // top(d t^128)
        vshr128 v5, v4, 32              // top(e t^64)
        vshl128 v4, v4, 64              // bot(e t^64)
        vshr128 v1, v3, 96              // top(f)
        eor     v6.16b, v6.16b, v5.16b  // top(d t^128 + e t^64)
        eor     v0.16b, v3.16b, v4.16b  // bot([d t^128] + e t^64 + f)
        eor     v1.16b, v1.16b, v6.16b  // top(d t^128 + e t^64 + f)

        // Finally, the reduction.  This is essentially the same as the
        // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
        // t^9 + t^6 + 1.  The degrees are larger but not enough to cause
        // trouble for the general approach.  Unfortunately, we have to do
        // this in 32-bit pieces rather than 64.

        // First, shift the high bits down.
        ushr    v2.4s, v1.4s, #26       // the b_i for t^6
        ushr    v3.4s, v1.4s, #23       // the b_i for t^9
        ushr    v4.4s, v1.4s, #22       // the b_i for t^10
        eor     v2.16b, v2.16b, v3.16b  // add them all together
        eor     v2.16b, v2.16b, v4.16b
        vshr128 v3, v2, 64              // contribution for high half
        vshl128 v2, v2, 32              // contribution for low half
        eor     v1.16b, v1.16b, v3.16b  // apply to high half
        eor     v0.16b, v0.16b, v2.16b  // and low half

        // And then shift the low bits up.
        shl     v2.4s, v1.4s, #6
        shl     v3.4s, v1.4s, #9
        shl     v4.4s, v1.4s, #10
        eor     v1.16b, v1.16b, v2.16b  // unit and t^6 contribs
        eor     v3.16b, v3.16b, v4.16b  // t^9 and t^10 contribs
        eor     v0.16b, v0.16b, v1.16b  // mix everything together
        eor     v0.16b, v0.16b, v3.16b  // ... and we're done
.endm

.macro  mul192
        // Enter with u in v0 and the less-significant half of v1, with v
        // duplicated across both halves of v2/v3/v4, and with zero in v31.
        // Leave with the product u v in v0 and the bottom half of v1.
        // Clobbers v16--v25.

        // Start multiplying and accumulating pieces of product.
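        // Explicitly, writing u = u_0 + u_1 t^64 + u_2 t^128, and similarly
        // for v, the product we're after is
        //
        //      u v = a + b t^64 + c t^128 + d t^192 + e t^256
        //
        // where a = u_0 v_0, b = u_0 v_1 + u_1 v_0,
        // c = u_0 v_2 + u_1 v_1 + u_2 v_0, d = u_1 v_2 + u_2 v_1, and
        // e = u_2 v_2; these are the quantities named in the comments
        // below.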
        // v0 =                         // (u_1; u_0)
        // v1 =                         // (?; u_2)
        // v2 =                         // (v_0; v_0)
        // v3 =                         // (v_1; v_1)
        // v4 =                         // (v_2; v_2)
        pmull   v16.1q, v0.1d, v2.1d    // a = u_0 v_0
        pmull   v19.1q, v0.1d, v3.1d    // u_0 v_1
        pmull2  v21.1q, v0.2d, v2.2d    // u_1 v_0
        pmull   v17.1q, v0.1d, v4.1d    // u_0 v_2
        pmull2  v22.1q, v0.2d, v3.2d    // u_1 v_1
        pmull   v23.1q, v1.1d, v2.1d    // u_2 v_0
        eor     v19.16b, v19.16b, v21.16b // b = u_0 v_1 + u_1 v_0
        pmull2  v20.1q, v0.2d, v4.2d    // u_1 v_2
        pmull   v24.1q, v1.1d, v3.1d    // u_2 v_1
        eor     v17.16b, v17.16b, v22.16b // u_0 v_2 + u_1 v_1
        pmull   v18.1q, v1.1d, v4.1d    // e = u_2 v_2
        eor     v17.16b, v17.16b, v23.16b // c = u_0 v_2 + u_1 v_1 + u_2 v_0
        eor     v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1

        // Piece the product together.
        // v16 =                        // (a_1; a_0)
        // v19 =                        // (b_1; b_0)
        // v17 =                        // (c_1; c_0)
        // v20 =                        // (d_1; d_0)
        // v18 =                        // (e_1; e_0)
        vshl128 v21, v19, 64            // (b_0; 0)
        ext     v22.16b, v19.16b, v20.16b, #8 // (d_0; b_1)
        vshr128 v23, v20, 64            // (0; d_1)
        eor     v16.16b, v16.16b, v21.16b // (x_1; x_0)
        eor     v17.16b, v17.16b, v22.16b // (x_3; x_2)
        eor     v18.16b, v18.16b, v23.16b // (x_5; x_4)

        // Next, the reduction.  Our polynomial this time is p(t) = t^192 +
        // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
        // 128-bit case.  I don't know why.

        // First, shift the high bits down.
        // v16 =                        // (y_1; y_0)
        // v17 =                        // (y_3; y_2)
        // v18 =                        // (y_5; y_4)
        mov     v19.d[0], v17.d[1]      // (?; y_3)
        ushr    v23.2d, v18.2d, #63     // hi b_i for t
        ushr    d20, d19, #63           // lo b_i for t
        ushr    v24.2d, v18.2d, #62     // hi b_i for t^2
        ushr    d21, d19, #62           // lo b_i for t^2
        ushr    v25.2d, v18.2d, #57     // hi b_i for t^7
        ushr    d22, d19, #57           // lo b_i for t^7
        eor     v23.16b, v23.16b, v24.16b // mix them all together
        eor     v20.8b, v20.8b, v21.8b
        eor     v23.16b, v23.16b, v25.16b
        eor     v20.8b, v20.8b, v22.8b

        // Permute the high pieces while we fold in the b_i.
        eor     v17.16b, v17.16b, v23.16b
        vshl128 v20, v20, 64
        mov     v19.d[0], v18.d[1]      // (?; y_5)
        ext     v18.16b, v17.16b, v18.16b, #8 // (y_4; y_3)
        eor     v16.16b, v16.16b, v20.16b

        // And finally shift the low bits up.
        // v16 =                        // (y'_1; y'_0)
        // v17 =                        // (?; y'_2)
        // v18 =                        // (y'_4; y'_3)
        // v19 =                        // (?; y'_5)
        shl     v20.2d, v18.2d, #1
        shl     d23, d19, #1
        shl     v21.2d, v18.2d, #2
        shl     d24, d19, #2
        shl     v22.2d, v18.2d, #7
        shl     d25, d19, #7
        eor     v18.16b, v18.16b, v20.16b // unit and t contribs
        eor     v19.8b, v19.8b, v23.8b
        eor     v21.16b, v21.16b, v22.16b // t^2 and t^7 contribs
        eor     v24.8b, v24.8b, v25.8b
        eor     v18.16b, v18.16b, v21.16b // all contribs
        eor     v19.8b, v19.8b, v24.8b
        eor     v0.16b, v16.16b, v18.16b // mix them into the low half
        eor     v1.8b, v17.8b, v19.8b
.endm

.macro  mul256
        // Enter with u in v0/v1, with v duplicated across both halves of
        // v2--v5, and with zero in v31.  Leave with the product u v in
        // v0/v1.  Clobbers v16--v21 and v24--v30.

        // Now it's starting to look worthwhile to do Karatsuba.  Suppose
        // u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
        //
        //      u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
        //
        // Name these coefficients of B^i a, b, and c, respectively, and
        // let r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
        //
        //      q = r s = (u_0 + u_1) (v_0 + v_1)
        //              = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
        //              = a + c + b
        //
        // The first two terms we've already calculated; the last is the
        // remaining one we want.  We'll set B = t^128.  We know how to do
        // 128-bit multiplications already, and Karatsuba is too annoying
        // there, so there'll be 12 multiplications altogether, rather than
        // the 16 we'd have if we did this the naïve way.
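        // In other words, once a = u_0 v_0, c = u_1 v_1, and q = r s are in
        // hand, the middle coefficient drops out as
        //
        //      b = q + a + c
        //
        // (addition here is XOR, so it's its own inverse), and the product
        // is then assembled as a + b t^128 + c t^256.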
        // v0 =                         // u_0 = (u_01; u_00)
        // v1 =                         // u_1 = (u_11; u_10)
        // v2 =                         // (v_00; v_00)
        // v3 =                         // (v_01; v_01)
        // v4 =                         // (v_10; v_10)
        // v5 =                         // (v_11; v_11)
        eor     v28.16b, v0.16b, v1.16b // u_* = (u_01 + u_11; u_00 + u_10)
        eor     v29.16b, v2.16b, v4.16b // v_*0 = v_00 + v_10
        eor     v30.16b, v3.16b, v5.16b // v_*1 = v_01 + v_11

        // Start by building the cross product, q = u_* v_*.
        pmull   v24.1q, v28.1d, v30.1d  // u_*0 v_*1
        pmull2  v25.1q, v28.2d, v29.2d  // u_*1 v_*0
        pmull   v20.1q, v28.1d, v29.1d  // u_*0 v_*0
        pmull2  v21.1q, v28.2d, v30.2d  // u_*1 v_*1
        eor     v24.16b, v24.16b, v25.16b // u_*0 v_*1 + u_*1 v_*0
        vshr128 v25, v24, 64
        vshl128 v24, v24, 64
        eor     v20.16b, v20.16b, v24.16b // q_0
        eor     v21.16b, v21.16b, v25.16b // q_1

        // Next, work on the low half, a = u_0 v_0.
        pmull   v24.1q, v0.1d, v3.1d    // u_00 v_01
        pmull2  v25.1q, v0.2d, v2.2d    // u_01 v_00
        pmull   v16.1q, v0.1d, v2.1d    // u_00 v_00
        pmull2  v17.1q, v0.2d, v3.2d    // u_01 v_01
        eor     v24.16b, v24.16b, v25.16b // u_00 v_01 + u_01 v_00
        vshr128 v25, v24, 64
        vshl128 v24, v24, 64
        eor     v16.16b, v16.16b, v24.16b // a_0
        eor     v17.16b, v17.16b, v25.16b // a_1

        // Mix the pieces we have so far.
        eor     v20.16b, v20.16b, v16.16b // q_0 + a_0
        eor     v21.16b, v21.16b, v17.16b // q_1 + a_1

        // Finally, work on the high half, c = u_1 v_1.
        pmull   v24.1q, v1.1d, v5.1d    // u_10 v_11
        pmull2  v25.1q, v1.2d, v4.2d    // u_11 v_10
        pmull   v18.1q, v1.1d, v4.1d    // u_10 v_10
        pmull2  v19.1q, v1.2d, v5.2d    // u_11 v_11
        eor     v24.16b, v24.16b, v25.16b // u_10 v_11 + u_11 v_10
        vshr128 v25, v24, 64
        vshl128 v24, v24, 64
        eor     v18.16b, v18.16b, v24.16b // c_0
        eor     v19.16b, v19.16b, v25.16b // c_1

        // Finish mixing the product together.
        eor     v20.16b, v20.16b, v18.16b // b_0 = q_0 + a_0 + c_0
        eor     v21.16b, v21.16b, v19.16b // b_1 = q_1 + a_1 + c_1
        eor     v17.16b, v17.16b, v20.16b // a_1 + b_0
        eor     v18.16b, v18.16b, v21.16b // c_0 + b_1

        // Now we must reduce.  This is essentially the same as the 192-bit
        // case above, but more complicated because everything is bigger.
        // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.

        // First, shift the high bits down.
        // v16 =                        // (y_1; y_0)
        // v17 =                        // (y_3; y_2)
        // v18 =                        // (y_5; y_4)
        // v19 =                        // (y_7; y_6)
        ushr    v24.2d, v18.2d, #62     // (y_5; y_4) b_i for t^2
        ushr    v25.2d, v19.2d, #62     // (y_7; y_6) b_i for t^2
        ushr    v26.2d, v18.2d, #59     // (y_5; y_4) b_i for t^5
        ushr    v27.2d, v19.2d, #59     // (y_7; y_6) b_i for t^5
        ushr    v28.2d, v18.2d, #54     // (y_5; y_4) b_i for t^10
        ushr    v29.2d, v19.2d, #54     // (y_7; y_6) b_i for t^10
        eor     v24.16b, v24.16b, v26.16b // mix the contributions together
        eor     v25.16b, v25.16b, v27.16b
        eor     v24.16b, v24.16b, v28.16b
        eor     v25.16b, v25.16b, v29.16b
        vshr128 v26, v25, 64            // slide contribs into position
        ext     v25.16b, v24.16b, v25.16b, #8
        vshl128 v24, v24, 64
        eor     v18.16b, v18.16b, v26.16b
        eor     v17.16b, v17.16b, v25.16b
        eor     v16.16b, v16.16b, v24.16b

        // And then shift the low bits up.
        // v16 =                        // (y'_1; y'_0)
        // v17 =                        // (y'_3; y'_2)
        // v18 =                        // (y'_5; y'_4)
        // v19 =                        // (y'_7; y'_6)
        shl     v24.2d, v18.2d, #2      // (y'_5; y'_4) a_i for t^2
        shl     v25.2d, v19.2d, #2      // (y'_7; y'_6) a_i for t^2
        shl     v26.2d, v18.2d, #5      // (y'_5; y'_4) a_i for t^5
        shl     v27.2d, v19.2d, #5      // (y'_7; y'_6) a_i for t^5
        shl     v28.2d, v18.2d, #10     // (y'_5; y'_4) a_i for t^10
        shl     v29.2d, v19.2d, #10     // (y'_7; y'_6) a_i for t^10
        eor     v18.16b, v18.16b, v24.16b // mix the contributions together
        eor     v19.16b, v19.16b, v25.16b
        eor     v26.16b, v26.16b, v28.16b
        eor     v27.16b, v27.16b, v29.16b
        eor     v18.16b, v18.16b, v26.16b
        eor     v19.16b, v19.16b, v27.16b
        eor     v0.16b, v16.16b, v18.16b
        eor     v1.16b, v17.16b, v19.16b
.endm

///--------------------------------------------------------------------------
/// Main code.

        // There are a number of representations of field elements in this
        // code and it can be confusing.
        //
        //   * The `external format' consists of a sequence of contiguous
        //     bytes in memory called a `block'.  The GCM spec explains how
        //     to interpret this block as an element of a finite field.  As
        //     discussed extensively, this representation is very annoying
        //     for a number of reasons.  On the other hand, this code never
        //     actually deals with it directly.
        //
        //   * The `register format' consists of one or more SIMD registers,
        //     depending on the block size.  The bits in each byte are
        //     reversed, compared to the external format, which makes the
        //     polynomials completely vanilla, unlike all of the other GCM
        //     implementations.
        //
        //   * The `table format' is just like the `register format', only
        //     the two halves of each 128-bit SIMD register are the same, so
        //     we need twice as many registers.
        //
        //   * The `words' format consists of a sequence of bytes, as in the
        //     `external format', but, according to the blockcipher in use,
        //     the bytes within each 32-bit word may be reversed
        //     (`big-endian') or not (`little-endian').  Accordingly, there
        //     are separate entry points for each variant, identified with
        //     `b' or `l'.
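        // To make the bit-reversal concrete: an external-format block whose
        // first byte is 0x80 (and which is zero elsewhere) is the field
        // element 1.  After RBIT that byte becomes 0x01, so that, reading a
        // register as a little-endian integer, bit i is simply the
        // coefficient of t^i.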
FUNC(gcm_mulk_128b_arm64_pmull)
        // On entry, x0 points to a 128-bit field element A in big-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     q0, [x0]
        ldp     q1, q2, [x1]
        rev32   v0.16b, v0.16b
        vzero
        rbit    v0.16b, v0.16b
        mul128
        rbit    v0.16b, v0.16b
        rev32   v0.16b, v0.16b
        str     q0, [x0]
        ret
ENDFUNC

FUNC(gcm_mulk_128l_arm64_pmull)
        // On entry, x0 points to a 128-bit field element A in little-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     q0, [x0]
        ldp     q1, q2, [x1]
        vzero
        rbit    v0.16b, v0.16b
        mul128
        rbit    v0.16b, v0.16b
        str     q0, [x0]
        ret
ENDFUNC

FUNC(gcm_mulk_64b_arm64_pmull)
        // On entry, x0 points to a 64-bit field element A in big-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     d0, [x0]
        ldr     q1, [x1]
        rev32   v0.8b, v0.8b
        rbit    v0.8b, v0.8b
        mul64
        rbit    x2, x2
        ror     x2, x2, #32
        str     x2, [x0]
        ret
ENDFUNC

FUNC(gcm_mulk_64l_arm64_pmull)
        // On entry, x0 points to a 64-bit field element A in little-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     d0, [x0]
        ldr     q1, [x1]
        rbit    v0.8b, v0.8b
        mul64
        rbit    x2, x2
        rev     x2, x2
        str     x2, [x0]
        ret
ENDFUNC

FUNC(gcm_mulk_96b_arm64_pmull)
        // On entry, x0 points to a 96-bit field element A in big-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     w2, [x0, #8]
        ldr     d0, [x0, #0]
        mov     v0.d[1], x2
        ldp     q1, q2, [x1]
        rev32   v0.16b, v0.16b
        vzero
        rbit    v0.16b, v0.16b
        mul96
        rbit    v0.16b, v0.16b
        rev32   v0.16b, v0.16b
        mov     w2, v0.s[2]
        str     d0, [x0, #0]
        str     w2, [x0, #8]
        ret
ENDFUNC

FUNC(gcm_mulk_96l_arm64_pmull)
        // On entry, x0 points to a 96-bit field element A in little-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     d0, [x0, #0]
        ldr     w2, [x0, #8]
        mov     v0.d[1], x2
        ldp     q1, q2, [x1]
        rbit    v0.16b, v0.16b
        vzero
        mul96
        rbit    v0.16b, v0.16b
        mov     w2, v0.s[2]
        str     d0, [x0, #0]
        str     w2, [x0, #8]
        ret
ENDFUNC
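        // The larger-block entry points below follow the same pattern as
        // the 128-bit ones above: load A and the key table K, apply RBIT
        // (and, for the big-endian `b' variants, REV32 as well) to reach
        // the register format, invoke the appropriate multiplication macro,
        // and undo the same permutation before storing A.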
FUNC(gcm_mulk_192b_arm64_pmull)
        // On entry, x0 points to a 192-bit field element A in big-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     q0, [x0, #0]
        ldr     d1, [x0, #16]
        ldp     q2, q3, [x1, #0]
        ldr     q4, [x1, #32]
        rev32   v0.16b, v0.16b
        rev32   v1.8b, v1.8b
        rbit    v0.16b, v0.16b
        rbit    v1.8b, v1.8b
        vzero
        mul192
        rev32   v0.16b, v0.16b
        rev32   v1.8b, v1.8b
        rbit    v0.16b, v0.16b
        rbit    v1.8b, v1.8b
        str     q0, [x0, #0]
        str     d1, [x0, #16]
        ret
ENDFUNC

FUNC(gcm_mulk_192l_arm64_pmull)
        // On entry, x0 points to a 192-bit field element A in little-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldr     q0, [x0, #0]
        ldr     d1, [x0, #16]
        ldp     q2, q3, [x1, #0]
        ldr     q4, [x1, #32]
        rbit    v0.16b, v0.16b
        rbit    v1.8b, v1.8b
        vzero
        mul192
        rbit    v0.16b, v0.16b
        rbit    v1.8b, v1.8b
        str     q0, [x0, #0]
        str     d1, [x0, #16]
        ret
ENDFUNC

FUNC(gcm_mulk_256b_arm64_pmull)
        // On entry, x0 points to a 256-bit field element A in big-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldp     q0, q1, [x0]
        ldp     q2, q3, [x1, #0]
        ldp     q4, q5, [x1, #32]
        rev32   v0.16b, v0.16b
        rev32   v1.16b, v1.16b
        rbit    v0.16b, v0.16b
        rbit    v1.16b, v1.16b
        vzero
        mul256
        rev32   v0.16b, v0.16b
        rev32   v1.16b, v1.16b
        rbit    v0.16b, v0.16b
        rbit    v1.16b, v1.16b
        stp     q0, q1, [x0]
        ret
ENDFUNC

FUNC(gcm_mulk_256l_arm64_pmull)
        // On entry, x0 points to a 256-bit field element A in little-endian
        // words format; x1 points to a field-element K in table format.  On
        // exit, A is updated with the product A K.

        ldp     q0, q1, [x0]
        ldp     q2, q3, [x1, #0]
        ldp     q4, q5, [x1, #32]
        rbit    v0.16b, v0.16b
        rbit    v1.16b, v1.16b
        vzero
        mul256
        rbit    v0.16b, v0.16b
        rbit    v1.16b, v1.16b
        stp     q0, q1, [x0]
        ret
ENDFUNC

///----- That's all, folks --------------------------------------------------