diff --git a/btcec.go b/btcec.go index a2a823ba..28822728 100644 --- a/btcec.go +++ b/btcec.go @@ -6,6 +6,10 @@ package btcec +// References: +// [SECG]: Recommended Elliptic Curve Domain Parameters +// http://www.secg.org/download/aid-784/sec2-v2.pdf + // This package operates, internally, on Jacobian coordinates. For a given // (x, y) position on the curve, the Jacobian coordinates are (x1, y1, z1) // where x = x1/z1² and y = y1/z1³. The greatest speedups come when the whole @@ -22,6 +26,13 @@ import ( //TODO: examine if we need to care about EC optimization as descibed here // https://bitcointalk.org/index.php?topic=155054.0;all +var ( + // fieldOne is simple the integer 1 in field representation. It is + // used to avoid needing to create it multiple times during the internal + // arithmetic. + fieldOne = new(fieldVal).SetInt(1) +) + // KoblitzCurve supports a koblitz curve implementation that fits the ECC Curve // interface from crypto/elliptic. type KoblitzCurve struct { @@ -29,200 +40,584 @@ type KoblitzCurve struct { q *big.Int } -// Params returns the parameters fro the curve. +// Params returns the parameters for the curve. func (curve *KoblitzCurve) Params() *elliptic.CurveParams { return curve.CurveParams } +// bigAffineToField takes an affine point (x, y) as big integers and converts +// it to an affine point as field values. +func (curve *KoblitzCurve) bigAffineToField(x, y *big.Int) (*fieldVal, *fieldVal) { + x3, y3 := new(fieldVal), new(fieldVal) + x3.SetByteSlice(x.Bytes()) + y3.SetByteSlice(y.Bytes()) + + return x3, y3 +} + +// fieldJacobianToBigAffine takes a Jacobian point (x, y, z) as field values and +// converts it to an affine point as big integers. +func (curve *KoblitzCurve) fieldJacobianToBigAffine(x, y, z *fieldVal) (*big.Int, *big.Int) { + // Inversions are expensive and both point addition and point doubling + // are faster when working with points that have a z value of one. So, + // if the point needs to be converted to affine, go ahead and normalize + // the point itself at the same time as the calculation is the same. + var zInv, tempZ fieldVal + zInv.Set(z).Inverse() // zInv = Z^-1 + tempZ.SquareVal(&zInv) // tempZ = Z^-2 + x.Mul(&tempZ) // X = X/Z^2 (mag: 1) + y.Mul(tempZ.Mul(&zInv)) // Y = Y/Z^3 (mag: 1) + z.SetInt(1) // Z = 1 (mag: 1) + + // Normalize the x and y values. + x.Normalize() + y.Normalize() + + // Convert the field values for the now affine point to big.Ints. + x3, y3 := new(big.Int), new(big.Int) + x3.SetBytes(x.Bytes()[:]) + y3.SetBytes(y.Bytes()[:]) + return x3, y3 +} + // IsOnCurve returns boolean if the point (x,y) is on the curve. // Part of the elliptic.Curve interface. This function differs from the // crypto/elliptic algorithm since a = 0 not -3. func (curve *KoblitzCurve) IsOnCurve(x, y *big.Int) bool { - // y² = x³ + b - y2 := new(big.Int).Mul(y, y) //y² - y2.Mod(y2, curve.P) //y²%P + // Convert big ints to field values for faster arithmetic. + fx, fy := curve.bigAffineToField(x, y) - x3 := new(big.Int).Mul(x, x) //x² - x3.Mul(x3, x) //x³ - - x3.Add(x3, curve.B) //x³+B - x3.Mod(x3, curve.P) //(x³+B)%P - - return x3.Cmp(y2) == 0 + // Elliptic curve equation for secp256k1 is: y^2 = x^3 + 7 + y2 := new(fieldVal).SquareVal(fy).Normalize() + result := new(fieldVal).SquareVal(fx).Mul(fx).AddInt(7).Normalize() + return y2.Equals(result) } -// zForAffine returns a Jacobian Z value for the affine point (x, y). If x and -// y are zero, it assumes that they represent the point at infinity because (0, -// 0) is not on the any of the curves handled here. -func zForAffine(x, y *big.Int) *big.Int { - z := new(big.Int) - if x.Sign() != 0 || y.Sign() != 0 { - z.SetInt64(1) - } - return z -} +// addZ1AndZ2EqualsOne adds two Jacobian points that are already known to have +// z values of 1 and stores the result in (x3, y3, z3). That is to say +// (x1, y1, 1) + (x2, y2, 1) = (x3, y3, z3). It performs faster addition than +// the generic add routine since less arithmetic is needed due to the ability to +// avoid the z value multiplications. +func (curve *KoblitzCurve) addZ1AndZ2EqualsOne(x1, y1, x2, y2, x3, y3, z3 *fieldVal) { + // To compute the point addition efficiently, this implementation splits + // the equation into intermediate elements which are used to minimize + // the number of field multiplications using the method shown at: + // http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-mmadd-2007-bl + // + // In particular it performs the calculations using the following: + // H = X2-X1, HH = H^2, I = 4*HH, J = H*I, r = 2*(Y2-Y1), V = X1*I + // X3 = r^2-J-2*V, Y3 = r*(V-X3)-2*Y1*J, Z3 = 2*H + // + // This results in a cost of 4 field multiplications, 2 field squarings, + // 6 field additions, and 5 integer multiplications. -// affineFromJacobian reverses the Jacobian transform. See the comment at the -// top of the file. If the point is ∞ it returns 0, 0. -func (curve *KoblitzCurve) affineFromJacobian(x, y, z *big.Int) (xOut, yOut *big.Int) { - if z.Sign() == 0 { - return new(big.Int), new(big.Int) + // When the x coordinates are the same for two points on the curve, the + // y coordinates either must be the same, in which case it is point + // doubling, or they are opposite and the result is the point at + // infinity per the group law for elliptic curve cryptography. + x1.Normalize() + y1.Normalize() + x2.Normalize() + y2.Normalize() + if x1.Equals(x2) { + if y1.Equals(y2) { + // Since x1 == x2 and y1 == y2, point doubling must be + // done, otherwise the addition would end up dividing + // by zero. + curve.doubleJacobian(x1, y1, fieldOne, x3, y3, z3) + return + } + + // Since x1 == x2 and y1 == -y2, the sum is the point at + // infinity per the group law. + x3.SetInt(0) + y3.SetInt(0) + z3.SetInt(0) + return } - zinv := new(big.Int).ModInverse(z, curve.P) - zinvsq := new(big.Int).Mul(zinv, zinv) + // Calculate X3, Y3, and Z3 according to the intermediate elements + // breakdown above. + var h, i, j, r, v fieldVal + var negJ, neg2V, negX3 fieldVal + h.Set(x1).Negate(1).Add(x2) // H = X2-X1 (mag: 3) + i.SquareVal(&h).MulInt(4) // I = 4*H^2 (mag: 4) + j.Mul2(&h, &i) // J = H*I (mag: 1) + r.Set(y1).Negate(1).Add(y2).MulInt(2) // r = 2*(Y2-Y1) (mag: 6) + v.Mul2(x1, &i) // V = X1*I (mag: 1) + negJ.Set(&j).Negate(1) // negJ = -J (mag: 2) + neg2V.Set(&v).MulInt(2).Negate(2) // neg2V = -(2*V) (mag: 3) + x3.Set(&r).Square().Add(&negJ).Add(&neg2V) // X3 = r^2-J-2*V (mag: 6) + negX3.Set(x3).Negate(6) // negX3 = -X3 (mag: 7) + j.Mul(y1).MulInt(2).Negate(2) // J = -(2*Y1*J) (mag: 3) + y3.Set(&v).Add(&negX3).Mul(&r).Add(&j) // Y3 = r*(V-X3)-2*Y1*J (mag: 4) + z3.Set(&h).MulInt(2) // Z3 = 2*H (mag: 6) - xOut = new(big.Int).Mul(x, zinvsq) - xOut.Mod(xOut, curve.P) - zinvsq.Mul(zinvsq, zinv) - yOut = new(big.Int).Mul(y, zinvsq) - yOut.Mod(yOut, curve.P) - return + // Normalize the resulting field values to a magnitude of 1 as needed. + x3.Normalize() + y3.Normalize() + z3.Normalize() } -// Add returns the sum of (x1,y1 and (x2,y2). Part of the elliptic.Curve -// interface. -func (curve *KoblitzCurve) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) { - z1 := zForAffine(x1, y1) - z2 := zForAffine(x2, y2) - return curve.affineFromJacobian(curve.addJacobian(x1, y1, z1, x2, y2, z2)) +// addZ1EqualsZ2 adds two Jacobian points that are already known to have the +// same z value and stores the result in (x3, y3, z3). That is to say +// (x1, y1, z1) + (x2, y2, z1) = (x3, y3, z3). It performs faster addition than +// the generic add routine since less arithmetic is needed due to the known +// equivalence. +func (curve *KoblitzCurve) addZ1EqualsZ2(x1, y1, z1, x2, y2, x3, y3, z3 *fieldVal) { + // To compute the point addition efficiently, this implementation splits + // the equation into intermediate elements which are used to minimize + // the number of field multiplications using a slightly modified version + // of the method shown at: + // http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-mmadd-2007-bl + // + // In particular it performs the calculations using the following: + // A = X2-X1, B = A^2, C=Y2-Y1, D = C^2, E = X1*B, F = X2*B + // X3 = D-E-F, Y3 = C*(E-X3)-Y1*(F-E), Z3 = Z1*A + // + // This results in a cost of 5 field multiplications, 2 field squarings, + // 9 field additions, and 0 integer multiplications. + + // When the x coordinates are the same for two points on the curve, the + // y coordinates either must be the same, in which case it is point + // doubling, or they are opposite and the result is the point at + // infinity per the group law for elliptic curve cryptography. + x1.Normalize() + y1.Normalize() + x2.Normalize() + y2.Normalize() + if x1.Equals(x2) { + if y1.Equals(y2) { + // Since x1 == x2 and y1 == y2, point doubling must be + // done, otherwise the addition would end up dividing + // by zero. + curve.doubleJacobian(x1, y1, z1, x3, y3, z3) + return + } + + // Since x1 == x2 and y1 == -y2, the sum is the point at + // infinity per the group law. + x3.SetInt(0) + y3.SetInt(0) + z3.SetInt(0) + return + } + + // Calculate X3, Y3, and Z3 according to the intermediate elements + // breakdown above. + var a, b, c, d, e, f fieldVal + var negX1, negY1, negE, negX3 fieldVal + negX1.Set(x1).Negate(1) // negX1 = -X1 (mag: 2) + negY1.Set(y1).Negate(1) // negY1 = -Y1 (mag: 2) + a.Set(&negX1).Add(x2) // A = X2-X1 (mag: 3) + b.SquareVal(&a) // B = A^2 (mag: 1) + c.Set(&negY1).Add(y2) // C = Y2-Y1 (mag: 3) + d.SquareVal(&c) // D = C^2 (mag: 1) + e.Mul2(x1, &b) // E = X1*B (mag: 1) + negE.Set(&e).Negate(1) // negE = -E (mag: 2) + f.Mul2(x2, &b) // F = X2*B (mag: 1) + x3.Add2(&e, &f).Negate(3).Add(&d) // X3 = D-E-F (mag: 5) + negX3.Set(x3).Negate(5).Normalize() // negX3 = -X3 (mag: 1) + y3.Set(y1).Mul(f.Add(&negE)).Negate(3) // Y3 = -(Y1*(F-E)) (mag: 4) + y3.Add(e.Add(&negX3).Mul(&c)) // Y3 = C*(E-X3)+Y3 (mag: 5) + z3.Mul2(z1, &a) // Z3 = Z1*A (mag: 1) + + // Normalize the resulting field values to a magnitude of 1 as needed. + x3.Normalize() + y3.Normalize() } -// addJacobian takes two points in Jacobian coordinates, (x1, y1, z1) and -// (x2, y2, z2) and returns their sum, also in Jacobian form. -func (curve *KoblitzCurve) addJacobian(x1, y1, z1, x2, y2, z2 *big.Int) (*big.Int, *big.Int, *big.Int) { - // See http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl - x3, y3, z3 := new(big.Int), new(big.Int), new(big.Int) - if z1.Sign() == 0 { +// addZ2EqualsOne adds two Jacobian points when the second point is already +// known to have a z value of 1 (and the z value for the first point is not 1) +// and stores the result in (x3, y3, z3). That is to say (x1, y1, z1) + +// (x2, y2, 1) = (x3, y3, z3). It performs faster addition than the generic +// add routine since less arithmetic is needed due to the ability to avoid +// multiplications by the second point's z value. +func (curve *KoblitzCurve) addZ2EqualsOne(x1, y1, z1, x2, y2, x3, y3, z3 *fieldVal) { + // To compute the point addition efficiently, this implementation splits + // the equation into intermediate elements which are used to minimize + // the number of field multiplications using the method shown at: + // http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl + // + // In particular it performs the calculations using the following: + // Z1Z1 = Z1^2, U2 = X2*Z1Z1, S2 = Y2*Z1*Z1Z1, H = U2-X1, HH = H^2, + // I = 4*HH, J = H*I, r = 2*(S2-Y1), V = X1*I + // X3 = r^2-J-2*V, Y3 = r*(V-X3)-2*Y1*J, Z3 = (Z1+H)^2-Z1Z1-HH + // + // This results in a cost of 7 field multiplications, 4 field squarings, + // 9 field additions, and 4 integer multiplications. + + // When the x coordinates are the same for two points on the curve, the + // y coordinates either must be the same, in which case it is point + // doubling, or they are opposite and the result is the point at + // infinity per the group law for elliptic curve cryptography. Since + // any number of Jacobian coordinates can represent the same affine + // point, the x and y values need to be converted to like terms. Due to + // the assumption made for this function that the second point has a z + // value of 1 (z2=1), the first point is already "converted". + var z1z1, u2, s2 fieldVal + x1.Normalize() + y1.Normalize() + z1z1.SquareVal(z1) // Z1Z1 = Z1^2 (mag: 1) + u2.Set(x2).Mul(&z1z1).Normalize() // U2 = X2*Z1Z1 (mag: 1) + s2.Set(y2).Mul(&z1z1).Mul(z1).Normalize() // S2 = Y2*Z1*Z1Z1 (mag: 1) + if x1.Equals(&u2) { + if y1.Equals(&s2) { + // Since x1 == x2 and y1 == y2, point doubling must be + // done, otherwise the addition would end up dividing + // by zero. + curve.doubleJacobian(x1, y1, z1, x3, y3, z3) + return + } + + // Since x1 == x2 and y1 == -y2, the sum is the point at + // infinity per the group law. + x3.SetInt(0) + y3.SetInt(0) + z3.SetInt(0) + return + } + + // Calculate X3, Y3, and Z3 according to the intermediate elements + // breakdown above. + var h, hh, i, j, r, rr, v fieldVal + var negX1, negY1, negX3 fieldVal + negX1.Set(x1).Negate(1) // negX1 = -X1 (mag: 2) + h.Add2(&u2, &negX1) // H = U2-X1 (mag: 3) + hh.SquareVal(&h) // HH = H^2 (mag: 1) + i.Set(&hh).MulInt(4) // I = 4 * HH (mag: 4) + j.Mul2(&h, &i) // J = H*I (mag: 1) + negY1.Set(y1).Negate(1) // negY1 = -Y1 (mag: 2) + r.Set(&s2).Add(&negY1).MulInt(2) // r = 2*(S2-Y1) (mag: 6) + rr.SquareVal(&r) // rr = r^2 (mag: 1) + v.Mul2(x1, &i) // V = X1*I (mag: 1) + x3.Set(&v).MulInt(2).Add(&j).Negate(3) // X3 = -(J+2*V) (mag: 4) + x3.Add(&rr) // X3 = r^2+X3 (mag: 5) + negX3.Set(x3).Negate(5) // negX3 = -X3 (mag: 6) + y3.Set(y1).Mul(&j).MulInt(2).Negate(2) // Y3 = -(2*Y1*J) (mag: 3) + y3.Add(v.Add(&negX3).Mul(&r)) // Y3 = r*(V-X3)+Y3 (mag: 4) + z3.Add2(z1, &h).Square() // Z3 = (Z1+H)^2 (mag: 1) + z3.Add(z1z1.Add(&hh).Negate(2)) // Z3 = Z3-(Z1Z1+HH) (mag: 4) + + // Normalize the resulting field values to a magnitude of 1 as needed. + x3.Normalize() + y3.Normalize() + z3.Normalize() +} + +// addGeneric adds two Jacobian points (x1, y1, z1) and (x2, y2, z2) without any +// assumptions about the z values of the two points and stores the result in +// (x3, y3, z3). That is to say (x1, y1, z1) + (x2, y2, z2) = (x3, y3, z3). It +// is the slowest of the add routines due to requiring the most arithmetic. +func (curve *KoblitzCurve) addGeneric(x1, y1, z1, x2, y2, z2, x3, y3, z3 *fieldVal) { + // To compute the point addition efficiently, this implementation splits + // the equation into intermediate elements which are used to minimize + // the number of field multiplications using the method shown at: + // http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl + // + // In particular it performs the calculations using the following: + // Z1Z1 = Z1^2, Z2Z2 = Z2^2, U1 = X1*Z2Z2, U2 = X2*Z1Z1, S1 = Y1*Z2*Z2Z2 + // S2 = Y2*Z1*Z1Z1, H = U2-U1, I = (2*H)^2, J = H*I, r = 2*(S2-S1) + // V = U1*I + // X3 = r^2-J-2*V, Y3 = r*(V-X3)-2*S1*J, Z3 = ((Z1+Z2)^2-Z1Z1-Z2Z2)*H + // + // This results in a cost of 11 field multiplications, 5 field squarings, + // 9 field additions, and 4 integer multiplications. + + // When the x coordinates are the same for two points on the curve, the + // y coordinates either must be the same, in which case it is point + // doubling, or they are opposite and the result is the point at + // infinity. Since any number of Jacobian coordinates can represent the + // same affine point, the x and y values need to be converted to like + // terms. + var z1z1, z2z2, u1, u2, s1, s2 fieldVal + z1z1.SquareVal(z1) // Z1Z1 = Z1^2 (mag: 1) + z2z2.SquareVal(z2) // Z2Z2 = Z2^2 (mag: 1) + u1.Set(x1).Mul(&z2z2).Normalize() // U1 = X1*Z2Z2 (mag: 1) + u2.Set(x2).Mul(&z1z1).Normalize() // U2 = X2*Z1Z1 (mag: 1) + s1.Set(y1).Mul(&z2z2).Mul(z2).Normalize() // S1 = Y1*Z2*Z2Z2 (mag: 1) + s2.Set(y2).Mul(&z1z1).Mul(z1).Normalize() // S2 = Y2*Z1*Z1Z1 (mag: 1) + if u1.Equals(&u2) { + if s1.Equals(&s2) { + // Since x1 == x2 and y1 == y2, point doubling must be + // done, otherwise the addition would end up dividing + // by zero. + curve.doubleJacobian(x1, y1, z1, x3, y3, z3) + return + } + + // Since x1 == x2 and y1 == -y2, the sum is the point at + // infinity per the group law. + x3.SetInt(0) + y3.SetInt(0) + z3.SetInt(0) + return + } + + // Calculate X3, Y3, and Z3 according to the intermediate elements + // breakdown above. + var h, i, j, r, rr, v fieldVal + var negU1, negS1, negX3 fieldVal + negU1.Set(&u1).Negate(1) // negU1 = -U1 (mag: 2) + h.Add2(&u2, &negU1) // H = U2-U1 (mag: 3) + i.Set(&h).MulInt(2).Square() // I = (2*H)^2 (mag: 2) + j.Mul2(&h, &i) // J = H*I (mag: 1) + negS1.Set(&s1).Negate(1) // negS1 = -S1 (mag: 2) + r.Set(&s2).Add(&negS1).MulInt(2) // r = 2*(S2-S1) (mag: 6) + rr.SquareVal(&r) // rr = r^2 (mag: 1) + v.Mul2(&u1, &i) // V = U1*I (mag: 1) + x3.Set(&v).MulInt(2).Add(&j).Negate(3) // X3 = -(J+2*V) (mag: 4) + x3.Add(&rr) // X3 = r^2+X3 (mag: 5) + negX3.Set(x3).Negate(5) // negX3 = -X3 (mag: 6) + y3.Mul2(&s1, &j).MulInt(2).Negate(2) // Y3 = -(2*S1*J) (mag: 3) + y3.Add(v.Add(&negX3).Mul(&r)) // Y3 = r*(V-X3)+Y3 (mag: 4) + z3.Add2(z1, z2).Square() // Z3 = (Z1+Z2)^2 (mag: 1) + z3.Add(z1z1.Add(&z2z2).Negate(2)) // Z3 = Z3-(Z1Z1+Z2Z2) (mag: 4) + z3.Mul(&h) // Z3 = Z3*H (mag: 1) + + // Normalize the resulting field values to a magnitude of 1 as needed. + x3.Normalize() + y3.Normalize() +} + +// addJacobian adds the passed Jacobian points (x1, y1, z1) and (x2, y2, z2) +// together and stores the result in (x3, y3, z3). +func (curve *KoblitzCurve) addJacobian(x1, y1, z1, x2, y2, z2, x3, y3, z3 *fieldVal) { + // A point at infinity is the identity according to the group law for + // elliptic curve cryptography. Thus, ∞ + P = P and P + ∞ = P. + if (x1.IsZero() && y1.IsZero()) || z1.IsZero() { x3.Set(x2) y3.Set(y2) z3.Set(z2) - return x3, y3, z3 + return } - if z2.Sign() == 0 { + if (x2.IsZero() && y2.IsZero()) || z2.IsZero() { x3.Set(x1) y3.Set(y1) z3.Set(z1) - return x3, y3, z3 + return } - z1z1 := new(big.Int).Mul(z1, z1) - z1z1.Mod(z1z1, curve.P) - z2z2 := new(big.Int).Mul(z2, z2) - z2z2.Mod(z2z2, curve.P) - - u1 := new(big.Int).Mul(x1, z2z2) - u1.Mod(u1, curve.P) - u2 := new(big.Int).Mul(x2, z1z1) - u2.Mod(u2, curve.P) - h := new(big.Int).Sub(u2, u1) - xEqual := h.Sign() == 0 - if h.Sign() == -1 { - h.Add(h, curve.P) + // Faster point addition can be achieved when certain assumptions are + // met. For example, when both points have the same z value, arithmetic + // on the z values can be avoided. This section thus checks for these + // conditions and calls an appropriate add function which is accelerated + // by using those assumptions. + z1.Normalize() + z2.Normalize() + isZ1One := z1.Equals(fieldOne) + isZ2One := z2.Equals(fieldOne) + switch { + case isZ1One && isZ2One: + curve.addZ1AndZ2EqualsOne(x1, y1, x2, y2, x3, y3, z3) + return + case z1.Equals(z2): + curve.addZ1EqualsZ2(x1, y1, z1, x2, y2, x3, y3, z3) + return + case isZ2One: + curve.addZ2EqualsOne(x1, y1, z1, x2, y2, x3, y3, z3) + return } - i := new(big.Int).Lsh(h, 1) - i.Mul(i, i) - j := new(big.Int).Mul(h, i) - s1 := new(big.Int).Mul(y1, z2) - s1.Mul(s1, z2z2) - s1.Mod(s1, curve.P) - s2 := new(big.Int).Mul(y2, z1) - s2.Mul(s2, z1z1) - s2.Mod(s2, curve.P) - r := new(big.Int).Sub(s2, s1) - if r.Sign() == -1 { - r.Add(r, curve.P) + // None of the above assumptions are true, so fall back to generic + // point addition. + curve.addGeneric(x1, y1, z1, x2, y2, z2, x3, y3, z3) +} + +// Add returns the sum of (x1,y1) and (x2,y2). Part of the elliptic.Curve +// interface. +func (curve *KoblitzCurve) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) { + // A point at infinity is the identity according to the group law for + // elliptic curve cryptography. Thus, ∞ + P = P and P + ∞ = P. + if x1.Sign() == 0 && y1.Sign() == 0 { + return x2, y2 } - yEqual := r.Sign() == 0 - if xEqual && yEqual { - return curve.doubleJacobian(x1, y1, z1) + if x2.Sign() == 0 && y2.Sign() == 0 { + return x1, y1 } - r.Lsh(r, 1) - v := new(big.Int).Mul(u1, i) - x3.Set(r) - x3.Mul(x3, x3) - x3.Sub(x3, j) - x3.Sub(x3, v) - x3.Sub(x3, v) - x3.Mod(x3, curve.P) + // Convert the affine coordinates from big integers to field values + // and do the point addition in Jacobian projective space. + fx1, fy1 := curve.bigAffineToField(x1, y1) + fx2, fy2 := curve.bigAffineToField(x2, y2) + fx3, fy3, fz3 := new(fieldVal), new(fieldVal), new(fieldVal) + curve.addJacobian(fx1, fy1, fieldOne, fx2, fy2, fieldOne, fx3, fy3, fz3) - y3.Set(r) - v.Sub(v, x3) - y3.Mul(y3, v) - s1.Mul(s1, j) - s1.Lsh(s1, 1) - y3.Sub(y3, s1) - y3.Mod(y3, curve.P) + // Convert the Jacobian coordinate field values back to affine big + // integers. + return curve.fieldJacobianToBigAffine(fx3, fy3, fz3) +} - z3.Add(z1, z2) - z3.Mul(z3, z3) - z3.Sub(z3, z1z1) - z3.Sub(z3, z2z2) - z3.Mul(z3, h) - z3.Mod(z3, curve.P) +// doubleZ1EqualsOne performs point doubling on the passed Jacobian point +// when the point is already known to have a z value of 1 and stores +// the result in (x3, y3, z3). That is to say (x3, y3, z3) = 2*(x1, y1, 1). It +// performs faster point doubling than the generic routine since less arithmetic +// is needed due to the ability to avoid multiplication by the z value. +func (curve *KoblitzCurve) doubleZ1EqualsOne(x1, y1, x3, y3, z3 *fieldVal) { + // This function uses the assumptions that z1 is 1, thus the point + // doubling formulas reduce to: + // + // X3 = (3*X1^2)^2 - 8*X1*Y1^2 + // Y3 = (3*X1^2)*(4*X1*Y1^2 - X3) - 8*Y1^4 + // Z3 = 2*Y1 + // + // To compute the above efficiently, this implementation splits the + // equation into intermediate elements which are used to minimize the + // number of field multiplications in favor of field squarings which + // are roughly 35% faster than field multiplications with the current + // implementation at the time this was written. + // + // This uses a slightly modified version of the method shown at: + // http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-mdbl-2007-bl + // + // In particular it performs the calculations using the following: + // A = X1^2, B = Y1^2, C = B^2, D = 2*((X1+B)^2-A-C) + // E = 3*A, F = E^2, X3 = F-2*D, Y3 = E*(D-X3)-8*C + // Z3 = 2*Y1 + // + // This results in a cost of 1 field multiplication, 5 field squarings, + // 6 field additions, and 5 integer multiplications. + var a, b, c, d, e, f fieldVal + z3.Set(y1).MulInt(2) // Z3 = 2*Y1 (mag: 2) + a.SquareVal(x1) // A = X1^2 (mag: 1) + b.SquareVal(y1) // B = Y1^2 (mag: 1) + c.SquareVal(&b) // C = B^2 (mag: 1) + b.Add(x1).Square() // B = (X1+B)^2 (mag: 1) + d.Set(&a).Add(&c).Negate(2) // D = -(A+C) (mag: 3) + d.Add(&b).MulInt(2) // D = 2*(B+D)(mag: 8) + e.Set(&a).MulInt(3) // E = 3*A (mag: 3) + f.SquareVal(&e) // F = E^2 (mag: 1) + x3.Set(&d).MulInt(2).Negate(16) // X3 = -(2*D) (mag: 17) + x3.Add(&f) // X3 = F+X3 (mag: 18) + f.Set(x3).Negate(18).Add(&d).Normalize() // F = D-X3 (mag: 1) + y3.Set(&c).MulInt(8).Negate(8) // Y3 = -(8*C) (mag: 9) + y3.Add(f.Mul(&e)) // Y3 = E*F+Y3 (mag: 10) - return x3, y3, z3 + // Normalize the field values back to a magnitude of 1. + x3.Normalize() + y3.Normalize() + z3.Normalize() +} + +// doubleGeneric performs point doubling on the passed Jacobian point without +// any assumptions about the z value and stores the result in (x3, y3, z3). +// That is to say (x3, y3, z3) = 2*(x1, y1, z1). It is the slowest of the point +// doubling routines due to requiring the most arithmetic. +func (cuve *KoblitzCurve) doubleGeneric(x1, y1, z1, x3, y3, z3 *fieldVal) { + // Point doubling formula for Jacobian coordinates for the secp256k1 + // curve: + // X3 = (3*X1^2)^2 - 8*X1*Y1^2 + // Y3 = (3*X1^2)*(4*X1*Y1^2 - X3) - 8*Y1^4 + // Z3 = 2*Y1*Z1 + // + // To compute the above efficiently, this implementation splits the + // equation into intermediate elements which are used to minimize the + // number of field multiplications in favor of field squarings which + // are roughly 35% faster than field multiplications with the current + // implementation at the time this was written. + // + // This uses a slightly modified version of the method shown at: + // http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l + // + // In particular it performs the calculations using the following: + // A = X1^2, B = Y1^2, C = B^2, D = 2*((X1+B)^2-A-C) + // E = 3*A, F = E^2, X3 = F-2*D, Y3 = E*(D-X3)-8*C + // Z3 = 2*Y1*Z1 + // + // This results in a cost of 1 field multiplication, 5 field squarings, + // 6 field additions, and 5 integer multiplications. + var a, b, c, d, e, f fieldVal + z3.Mul2(y1, z1).MulInt(2) // Z3 = 2*Y1*Z1 (mag: 2) + a.SquareVal(x1) // A = X1^2 (mag: 1) + b.SquareVal(y1) // B = Y1^2 (mag: 1) + c.SquareVal(&b) // C = B^2 (mag: 1) + b.Add(x1).Square() // B = (X1+B)^2 (mag: 1) + d.Set(&a).Add(&c).Negate(2) // D = -(A+C) (mag: 3) + d.Add(&b).MulInt(2) // D = 2*(B+D)(mag: 8) + e.Set(&a).MulInt(3) // E = 3*A (mag: 3) + f.SquareVal(&e) // F = E^2 (mag: 1) + x3.Set(&d).MulInt(2).Negate(16) // X3 = -(2*D) (mag: 17) + x3.Add(&f) // X3 = F+X3 (mag: 18) + f.Set(x3).Negate(18).Add(&d).Normalize() // F = D-X3 (mag: 1) + y3.Set(&c).MulInt(8).Negate(8) // Y3 = -(8*C) (mag: 9) + y3.Add(f.Mul(&e)) // Y3 = E*F+Y3 (mag: 10) + + // Normalize the field values back to a magnitude of 1. + x3.Normalize() + y3.Normalize() + z3.Normalize() +} + +// doubleJacobian doubles the passed Jacobian point (x1, y1, z1) and stores the +// result in (x3, y3, z3). +func (curve *KoblitzCurve) doubleJacobian(x1, y1, z1, x3, y3, z3 *fieldVal) { + // Doubling a point at infinity is still infinity. + if y1.IsZero() || z1.IsZero() { + x3.SetInt(0) + y3.SetInt(0) + z3.SetInt(0) + return + } + + // Slightly faster point doubling can be achieved when the z value is 1 + // by avoiding the multiplication on the z value. This section calls + // a point doubling function which is accelerated by using that + // assumption when possible. + if z1.Normalize().Equals(fieldOne) { + curve.doubleZ1EqualsOne(x1, y1, x3, y3, z3) + return + } + + // Fall back to generic point doubling which works with arbitrary z + // values. + curve.doubleGeneric(x1, y1, z1, x3, y3, z3) } // Double returns 2*(x1,y1). Part of the elliptic.Curve interface. func (curve *KoblitzCurve) Double(x1, y1 *big.Int) (*big.Int, *big.Int) { - z1 := zForAffine(x1, y1) - return curve.affineFromJacobian(curve.doubleJacobian(x1, y1, z1)) -} + if y1.Sign() == 0 { + return new(big.Int), new(big.Int) + } -// doubleJacobian takes a point in Jacobian coordinates, (x, y, z), and -// returns its double, also in Jacobian form. -func (curve *KoblitzCurve) doubleJacobian(x, y, z *big.Int) (*big.Int, *big.Int, *big.Int) { - // See http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l + // Convert the affine coordinates from big integers to field values + // and do the point doubling in Jacobian projective space. + fx1, fy1 := curve.bigAffineToField(x1, y1) + fx3, fy3, fz3 := new(fieldVal), new(fieldVal), new(fieldVal) + curve.doubleJacobian(fx1, fy1, fieldOne, fx3, fy3, fz3) - a := new(big.Int).Mul(x, x) //X1² - b := new(big.Int).Mul(y, y) //Y1² - c := new(big.Int).Mul(b, b) //B² - - d := new(big.Int).Add(x, b) //X1+B - d.Mul(d, d) //(X1+B)² - d.Sub(d, a) //(X1+B)²-A - d.Sub(d, c) //(X1+B)²-A-C - d.Mul(d, big.NewInt(2)) //2*((X1+B)²-A-C) - - e := new(big.Int).Mul(big.NewInt(3), a) //3*A - f := new(big.Int).Mul(e, e) //E² - - x3 := new(big.Int).Mul(big.NewInt(2), d) //2*D - x3.Sub(f, x3) //F-2*D - x3.Mod(x3, curve.P) - - y3 := new(big.Int).Sub(d, x3) //D-X3 - y3.Mul(e, y3) //E*(D-X3) - y3.Sub(y3, new(big.Int).Mul(big.NewInt(8), c)) //E*(D-X3)-8*C - y3.Mod(y3, curve.P) - - z3 := new(big.Int).Mul(y, z) //Y1*Z1 - z3.Mul(big.NewInt(2), z3) //3*Y1*Z1 - z3.Mod(z3, curve.P) - - return x3, y3, z3 + // Convert the Jacobian coordinate field values back to affine big + // integers. + return curve.fieldJacobianToBigAffine(fx3, fy3, fz3) } // ScalarMult returns k*(Bx, By) where k is a big endian integer. // Part of the elliptic.Curve interface. func (curve *KoblitzCurve) ScalarMult(Bx, By *big.Int, k []byte) (*big.Int, *big.Int) { - Bz := new(big.Int).SetInt64(1) - x, y, z := new(big.Int), new(big.Int), new(big.Int) + // This uses the left to right binary method for point multiplication: - for _, byte := range k { + // Point Q = ∞ (point at infinity). + qx, qy, qz := new(fieldVal), new(fieldVal), new(fieldVal) + + // Point P = the point to multiply the scalar with. + px, py := curve.bigAffineToField(Bx, By) + pz := fieldOne + + // Double and add as necessary depending on the bits set in the scalar. + for _, byteVal := range k { for bitNum := 0; bitNum < 8; bitNum++ { - x, y, z = curve.doubleJacobian(x, y, z) - if byte&0x80 == 0x80 { - x, y, z = curve.addJacobian(Bx, By, Bz, x, y, z) + // Q = 2*Q + curve.doubleJacobian(qx, qy, qz, qx, qy, qz) + if byteVal&0x80 == 0x80 { + // Q = Q + P + curve.addJacobian(qx, qy, qz, px, py, pz, qx, + qy, qz) } - byte <<= 1 + byteVal <<= 1 } } - return curve.affineFromJacobian(x, y, z) + // Convert the Jacobian coordinate field values back to affine big.Ints. + return curve.fieldJacobianToBigAffine(qx, qy, qz) } // ScalarBaseMult returns k*G where G is the base point of the group and k is a @@ -247,7 +642,7 @@ func initAll() { } func initS256() { - // See SEC 2 section 2.7.1 + // See [SECG] section 2.7.1 secp256k1.CurveParams = new(elliptic.CurveParams) secp256k1.P, _ = new(big.Int).SetString("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F", 16) secp256k1.N, _ = new(big.Int).SetString("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141", 16) diff --git a/field.go b/field.go new file mode 100644 index 00000000..ac123e9e --- /dev/null +++ b/field.go @@ -0,0 +1,1270 @@ +// Copyright (c) 2013 Conformal Systems LLC. +// Copyright (c) 2013 Dave Collins +// Use of this source code is governed by an ISC +// license that can be found in the LICENSE file. + +package btcec + +// References: +// [HAC]: Handbook of Applied Cryptography Menezes, van Oorschot, Vanstone. +// http://cacr.uwaterloo.ca/hac/ + +// All elliptic curve operations for secp256k1 are done in a finite field +// characterized by a 256-bit prime. Given this precision is larger than the +// biggest available native type, obviously some form of bignum math is needed. +// This package implements specialized fixed-precision field arithmetic rather +// than relying on an arbitrary-precision arithmetic package such as math/big +// for dealing with the field math since the size is known. As a result, rather +// large performance gains are achieved by taking advantage of many +// optimizations not available to arbitrary-precision arithmetic and generic +// modular arithmetic algorithms. +// +// There are various ways to internally represent each finite field element. +// For example, the most obvious representation would be to use an array of 4 +// uint64s (64 bits * 4 = 256 bits). However, that representation suffers from +// a couple of issues. First, there is no native Go type large enough to handle +// the intermediate results while adding or multiplying two 64-bit numbers, and +// second there is no space left for overflows when performing the intermediate +// arithmetic between each array element which would lead to expensive carry +// propagation. +// +// Given the above, this implementation represents the the field elements as +// 10 uint32s with each word (array entry) treated as base 2^26. This was +// chosen for the following reasons: +// 1) Most systems at the current time are 64-bit (or at least have 64-bit +// registers available for specialized purposes such as MMX) so the +// intermediate results can typically be done using a native register (and +// using uint64s to avoid the need for additional half-word arithmetic) +// 2) In order to allow addition of the internal words without having to +// propagate the the carry, the max normalized value for each register must +// be less than the number of bits available in the register +// 3) Since we're dealing with 32-bit values, 64-bits of overflow is a +// reasonable choice for #2 +// 4) Given the need for 256-bits of precision and the properties stated in #1, +// #2, and #3, the representation which best accomodates this is 10 uint32s +// with base 2^26 (26 bits * 10 = 260 bits, so the final word only needs 22 +// bits) which leaves the desired 64 bits (32 * 10 = 320, 320 - 256 = 64) for +// overflow +// +// Since it is so important that the field arithmetic is extremely fast for +// high performance crypto, this package does not perform any validation where +// it ordinarily would. For example, some functions only give the correct +// result is the field is normalized and there is no checking to ensure it is. +// While I typically prefer to ensure all state and input is valid for most +// packages, this code is really only used internally and every extra check +// counts. + +import ( + "encoding/hex" +) + +// Constants used to make the code more readable. +const ( + twoBitsMask = 0x3 + fourBitsMask = 0xf + sixBitsMask = 0x3f + eightBitsMask = 0xff +) + +// Constants related to the field representation. +const ( + // fieldWords is the number of words used to internally represent the + // 256-bit value. + fieldWords = 10 + + // fieldBase is the exponent used to form the numeric base of each word. + // 2^(fieldBase*i) where i is the word position. + fieldBase = 26 + + // fieldOverflowBits is the minimum number of "overflow" bits for each + // word in the field value. + fieldOverflowBits = 32 - fieldBase + + // fieldBaseMask is the mask for the bits in each word needed to + // represent the numeric base of each word (except the most significant + // word). + fieldBaseMask = (1 << fieldBase) - 1 + + // fieldMSBBits is the number of bits in the most significant word used + // to represent the value. + fieldMSBBits = 256 - (fieldBase * (fieldWords - 1)) + + // fieldMSBMask is the mask for the bits in the most significant word + // needed to represent the value. + fieldMSBMask = (1 << fieldMSBBits) - 1 + + // fieldPrimeWordZero is word zero of the secp256k1 prime in the + // internal field representation. It is used during modular reduction + // and negation. + fieldPrimeWordZero = 0x3fffc2f + + // fieldPrimeWordOne is word one of the secp256k1 prime in the + // internal field representation. It is used during modular reduction + // and negation. + fieldPrimeWordOne = 0x3ffffbf +) + +// fieldVal implements optimized fixed-precision arithmetic over the +// secp256k1 finite field. This means all arithmetic is performed modulo +// 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f. It +// represents each 256-bit value as 10 32-bit integers in base 2^26. This +// provides 6 bits of overflow in each word (10 bits in the most significant +// word) for a total of 64 bits of overflow (9*6 + 10 = 64). It only implements +// the arithmetic needed for elliptic curve operations. +// +// The following depicts the internal representation: +// ----------------------------------------------------------------- +// | n[9] | n[8] | ... | n[0] | +// | 32 bits available | 32 bits available | ... | 32 bits available | +// | 22 bits for value | 26 bits for value | ... | 26 bits for value | +// | 10 bits overflow | 6 bits overflow | ... | 6 bits overflow | +// | Mult: 2^(26*9) | Mult: 2^(26*8) | ... | Mult: 2^(26*0) | +// ----------------------------------------------------------------- +// +// For example, consider the number 2^49 + 1. It would be represented as: +// n[0] = 1 +// n[1] = 2^23 +// n[2..9] = 0 +// +// The full 256-bit value is then calculated by looping i from 9..0 and +// doing sum(n[i] * 2^(26i)) like so: +// n[9] * 2^(26*9) = 0 * 2^234 = 0 +// n[8] * 2^(26*8) = 0 * 2^208 = 0 +// ... +// n[1] * 2^(26*1) = 2^23 * 2^26 = 2^49 +// n[0] * 2^(26*0) = 1 * 2^0 = 1 +// Sum: 0 + 0 + ... + 2^49 + 1 = 2^49 + 1 +type fieldVal struct { + n [10]uint32 +} + +// String returns the field value as a human-readable hex string. +func (f fieldVal) String() string { + t := new(fieldVal).Set(&f).Normalize() + return hex.EncodeToString(t.Bytes()[:]) +} + +// Zero sets the field value to zero. A newly created field value is already +// set to zero. This function can be useful to clear an existing field value +// for reuse. +func (f *fieldVal) Zero() { + f.n[0] = 0 + f.n[1] = 0 + f.n[2] = 0 + f.n[3] = 0 + f.n[4] = 0 + f.n[5] = 0 + f.n[6] = 0 + f.n[7] = 0 + f.n[8] = 0 + f.n[9] = 0 +} + +// Set sets the field value equal to the passed value. +// +// The field value is returned to support chaining. This enables syntax like: +// f := new(fieldVal).Set(f2).Add(1) so that f = f2 + 1 where f2 is not +// modified. +func (f *fieldVal) Set(val *fieldVal) *fieldVal { + *f = *val + return f +} + +// SetInt sets the field value to the passed integer. This is a convenience +// function since it is fairly common to perform some arithemetic with small +// native integers. +// +// The field value is returned to support chaining. This enables syntax such +// as f := new(fieldVal).SetInt(2).Mul(f2) so that f = 2 * f2. +func (f *fieldVal) SetInt(ui uint) *fieldVal { + f.Zero() + f.n[0] = uint32(ui) + return f +} + +// SetBytes packs the passed 32-byte big-endian value into the internal field +// value representation. +// +// The field value is returned to support chaining. This enables syntax like: +// f := new(fieldVal).SetBytes(byteArray).Mul(f2) so that f = ba * f2. +func (f *fieldVal) SetBytes(b *[32]byte) *fieldVal { + // Pack the 256 total bits across the 10 uint32 words with a max of + // 26-bits per word. This could be done with a couple of for loops, + // but this unrolled version is significantly faster. Benchmarks show + // this is about 34 times faster than the variant which uses loops. + f.n[0] = uint32(b[31]) | uint32(b[30])<<8 | uint32(b[29])<<16 | + (uint32(b[28])&twoBitsMask)<<24 + f.n[1] = uint32(b[28])>>2 | uint32(b[27])<<6 | uint32(b[26])<<14 | + (uint32(b[25])&fourBitsMask)<<22 + f.n[2] = uint32(b[25])>>4 | uint32(b[24])<<4 | uint32(b[23])<<12 | + (uint32(b[22])&sixBitsMask)<<20 + f.n[3] = uint32(b[22])>>6 | uint32(b[21])<<2 | uint32(b[20])<<10 | + uint32(b[19])<<18 + f.n[4] = uint32(b[18]) | uint32(b[17])<<8 | uint32(b[16])<<16 | + (uint32(b[15])&twoBitsMask)<<24 + f.n[5] = uint32(b[15])>>2 | uint32(b[14])<<6 | uint32(b[13])<<14 | + (uint32(b[12])&fourBitsMask)<<22 + f.n[6] = uint32(b[12])>>4 | uint32(b[11])<<4 | uint32(b[10])<<12 | + (uint32(b[9])&sixBitsMask)<<20 + f.n[7] = uint32(b[9])>>6 | uint32(b[8])<<2 | uint32(b[7])<<10 | + uint32(b[6])<<18 + f.n[8] = uint32(b[5]) | uint32(b[4])<<8 | uint32(b[3])<<16 | + (uint32(b[2])&twoBitsMask)<<24 + f.n[9] = uint32(b[2])>>2 | uint32(b[1])<<6 | uint32(b[0])<<14 + return f +} + +// SetByteSlice packs the passed big-endian value into the internal field value +// representation. Only the first 32-bytes are used. As a result, it is up to +// the caller to ensure numbers of the appropriate size are used or the value +// will be truncated. +// +// The field value is returned to support chaining. This enables syntax like: +// f := new(fieldVal).SetByteSlice(byteSlice) +func (f *fieldVal) SetByteSlice(b []byte) *fieldVal { + var b32 [32]byte + for i := 0; i < len(b); i++ { + if i < 32 { + b32[i+(32-len(b))] = b[i] + } + } + return f.SetBytes(&b32) +} + +// SetHex decodes the passed big-endian hex string into the internal field value +// representation. Only the first 32-bytes are used. +// +// The field value is returned to support chaining. This enables syntax like: +// f := new(fieldVal).SetHex("0abc").Add(1) so that f = 0x0abc + 1 +func (f *fieldVal) SetHex(hexString string) *fieldVal { + if len(hexString)%2 != 0 { + hexString = "0" + hexString + } + bytes, _ := hex.DecodeString(hexString) + return f.SetByteSlice(bytes) +} + +// Normalize normalizes the internal field words into the desired range and +// performs fast modular reduction over the secp256k1 prime by making use of the +// special form of the prime. +func (f *fieldVal) Normalize() *fieldVal { + // The field representation leaves 6 bits of overflow in each + // word so intermediate calculations can be performed without needing + // to propagate the carry to each higher word during the calculations. + // In order to normalize, first we need to "compact" the full 256-bit + // value to the right and treat the additional 64 leftmost bits as + // the magnitude. + m := f.n[0] + t0 := m & fieldBaseMask + m = (m >> fieldBase) + f.n[1] + t1 := m & fieldBaseMask + m = (m >> fieldBase) + f.n[2] + t2 := m & fieldBaseMask + m = (m >> fieldBase) + f.n[3] + t3 := m & fieldBaseMask + m = (m >> fieldBase) + f.n[4] + t4 := m & fieldBaseMask + m = (m >> fieldBase) + f.n[5] + t5 := m & fieldBaseMask + m = (m >> fieldBase) + f.n[6] + t6 := m & fieldBaseMask + m = (m >> fieldBase) + f.n[7] + t7 := m & fieldBaseMask + m = (m >> fieldBase) + f.n[8] + t8 := m & fieldBaseMask + m = (m >> fieldBase) + f.n[9] + t9 := m & fieldMSBMask + m = m >> fieldMSBBits + + // At this point, if the magnitude is greater than 0, the overall value + // is greater than the max possible 256-bit value. In particular, it is + // "how many times larger" than the max value it is. Since this field + // is doing arithmetic modulo the secp256k1 prime, we need to perform + // modular reduction over the prime. + // + // Per [HAC] section 14.3.4: Reduction method of moduli of special form, + // when the modulus is of the special form m = b^t - c, highly efficient + // reduction can be achieved. + // + // The secp256k1 prime is equivalent to 2^256 - 4294968273, so it fits + // this criteria. + // + // 4294968273 in field representation (base 2^26) is: + // n[0] = 977 + // n[1] = 64 + // That is to say (2^26 * 64) + 977 = 4294968273 + // + // The algorithm presented in the referenced section typically repeats + // until the quotient is zero. However, due to our field representation + // we already know at least how many times we would need to repeat as + // it's the value currently in m. Thus we can simply multiply the + // magnitude by the field representation of the prime and do a single + // iteration. Notice that nothing will be changed when the magnitude is + // zero, so we could skip this in that case, however always running + // regardless allows it to run in constant time. + r := t0 + m*977 + t0 = r & fieldBaseMask + r = (r >> fieldBase) + t1 + m*64 + t1 = r & fieldBaseMask + r = (r >> fieldBase) + t2 + t2 = r & fieldBaseMask + r = (r >> fieldBase) + t3 + t3 = r & fieldBaseMask + r = (r >> fieldBase) + t4 + t4 = r & fieldBaseMask + r = (r >> fieldBase) + t5 + t5 = r & fieldBaseMask + r = (r >> fieldBase) + t6 + t6 = r & fieldBaseMask + r = (r >> fieldBase) + t7 + t7 = r & fieldBaseMask + r = (r >> fieldBase) + t8 + t8 = r & fieldBaseMask + r = (r >> fieldBase) + t9 + t9 = r & fieldMSBMask + + // At this point, the result will be in the range 0 <= result <= + // prime + (2^64 - c). Therefore, one more subtraction of the prime + // might be needed if the current result is greater than or equal to the + // prime. The following does the final reduction in constant time. + // Note that the if/else here intentionally does the bitwise OR with + // zero even though it won't change the value to ensure constant time + // between the branches. + var mask int32 + if t0 < fieldPrimeWordZero { + mask |= -1 + } else { + mask |= 0 + } + if t1 < fieldPrimeWordOne { + mask |= -1 + } else { + mask |= 0 + } + if t2 < fieldBaseMask { + mask |= -1 + } else { + mask |= 0 + } + if t3 < fieldBaseMask { + mask |= -1 + } else { + mask |= 0 + } + if t4 < fieldBaseMask { + mask |= -1 + } else { + mask |= 0 + } + if t5 < fieldBaseMask { + mask |= -1 + } else { + mask |= 0 + } + if t6 < fieldBaseMask { + mask |= -1 + } else { + mask |= 0 + } + if t7 < fieldBaseMask { + mask |= -1 + } else { + mask |= 0 + } + if t8 < fieldBaseMask { + mask |= -1 + } else { + mask |= 0 + } + if t9 < fieldMSBMask { + mask |= -1 + } else { + mask |= 0 + } + t0 = t0 - uint32(^mask&fieldPrimeWordZero) + t1 = t1 - uint32(^mask&fieldPrimeWordOne) + t2 = t2 & uint32(mask) + t3 = t3 & uint32(mask) + t4 = t4 & uint32(mask) + t5 = t5 & uint32(mask) + t6 = t6 & uint32(mask) + t7 = t7 & uint32(mask) + t8 = t8 & uint32(mask) + t9 = t9 & uint32(mask) + + // Finally, set the normalized and reduced words. + f.n[0] = t0 + f.n[1] = t1 + f.n[2] = t2 + f.n[3] = t3 + f.n[4] = t4 + f.n[5] = t5 + f.n[6] = t6 + f.n[7] = t7 + f.n[8] = t8 + f.n[9] = t9 + return f +} + +// PutBytes unpacks the field value to a 32-byte big-endian value using the +// passed byte array. There is a similar function, Bytes, which unpacks the +// field value into a new array and returns that. This version is provided +// since it can be useful to cut down on the number of allocations by allowing +// the caller to reuse a buffer. +// +// The field value must be normalized for this function to return the correct +// result. +func (f *fieldVal) PutBytes(b *[32]byte) { + // Unpack the 256 total bits from the 10 uint32 words with a max of + // 26-bits per word. This could be done with a couple of for loops, + // but this unrolled version is a bit faster. Benchmarks show this is + // about 10 times faster than the variant which uses loops. + b[31] = byte(f.n[0] & eightBitsMask) + b[30] = byte((f.n[0] >> 8) & eightBitsMask) + b[29] = byte((f.n[0] >> 16) & eightBitsMask) + b[28] = byte((f.n[0]>>24)&twoBitsMask | (f.n[1]&sixBitsMask)<<2) + b[27] = byte((f.n[1] >> 6) & eightBitsMask) + b[26] = byte((f.n[1] >> 14) & eightBitsMask) + b[25] = byte((f.n[1]>>22)&fourBitsMask | (f.n[2]&fourBitsMask)<<4) + b[24] = byte((f.n[2] >> 4) & eightBitsMask) + b[23] = byte((f.n[2] >> 12) & eightBitsMask) + b[22] = byte((f.n[2]>>20)&sixBitsMask | (f.n[3]&twoBitsMask)<<6) + b[21] = byte((f.n[3] >> 2) & eightBitsMask) + b[20] = byte((f.n[3] >> 10) & eightBitsMask) + b[19] = byte((f.n[3] >> 18) & eightBitsMask) + b[18] = byte(f.n[4] & eightBitsMask) + b[17] = byte((f.n[4] >> 8) & eightBitsMask) + b[16] = byte((f.n[4] >> 16) & eightBitsMask) + b[15] = byte((f.n[4]>>24)&twoBitsMask | (f.n[5]&sixBitsMask)<<2) + b[14] = byte((f.n[5] >> 6) & eightBitsMask) + b[13] = byte((f.n[5] >> 14) & eightBitsMask) + b[12] = byte((f.n[5]>>22)&fourBitsMask | (f.n[6]&fourBitsMask)<<4) + b[11] = byte((f.n[6] >> 4) & eightBitsMask) + b[10] = byte((f.n[6] >> 12) & eightBitsMask) + b[9] = byte((f.n[6]>>20)&sixBitsMask | (f.n[7]&twoBitsMask)<<6) + b[8] = byte((f.n[7] >> 2) & eightBitsMask) + b[7] = byte((f.n[7] >> 10) & eightBitsMask) + b[6] = byte((f.n[7] >> 18) & eightBitsMask) + b[5] = byte(f.n[8] & eightBitsMask) + b[4] = byte((f.n[8] >> 8) & eightBitsMask) + b[3] = byte((f.n[8] >> 16) & eightBitsMask) + b[2] = byte((f.n[8]>>24)&twoBitsMask | (f.n[9]&sixBitsMask)<<2) + b[1] = byte((f.n[9] >> 6) & eightBitsMask) + b[0] = byte((f.n[9] >> 14) & eightBitsMask) +} + +// Bytes unpacks the field value to a 32-byte big-endian value. See PutBytes +// for a variant that allows the a buffer to be passed which can be useful to +// to cut down on the number of allocations by allowing the caller to reuse a +// buffer. +// +// The field value must be normalized for this function to return correct +// result. +func (f *fieldVal) Bytes() *[32]byte { + b := new([32]byte) + f.PutBytes(b) + return b +} + +// IsZero returns whether or not the field value is equal to zero. +func (f *fieldVal) IsZero() bool { + // The value can only be zero if no bits are set in any of the words. + // This is a constant time implementation. + bits := f.n[0] | f.n[1] | f.n[2] | f.n[3] | f.n[4] | + f.n[5] | f.n[6] | f.n[7] | f.n[8] | f.n[9] + + return bits == 0 +} + +// IsOdd returns whether or not the field value is an odd number. +// +// The field value must be normalized for this function to return correct +// result. +func (f *fieldVal) IsOdd() bool { + // Only odd numbers have the bottom bit set. + return f.n[0]&1 == 1 +} + +// Equals returns whether or not the two field values are the same. Both +// field values being compared must be normalized for this function to return +// the correct result. +func (f *fieldVal) Equals(val *fieldVal) bool { + // Xor only sets bits when they are different, so the two field values + // can only be the same if no bits are set after xoring each word. + // This is a constant time implementation. + bits := (f.n[0] ^ val.n[0]) | (f.n[1] ^ val.n[1]) | (f.n[2] ^ val.n[2]) | + (f.n[3] ^ val.n[3]) | (f.n[4] ^ val.n[4]) | (f.n[5] ^ val.n[5]) | + (f.n[6] ^ val.n[6]) | (f.n[7] ^ val.n[7]) | (f.n[8] ^ val.n[8]) | + (f.n[9] ^ val.n[9]) + + return bits == 0 +} + +// NegateVal negates the passed value and stores the result in f. The caller +// must provide the magnitude of the passed value for a correct result. +// +// The field value is returned to support chaining. This enables syntax like: +// f.NegateVal(f2).AddInt(1) so that f = -f2 + 1. +func (f *fieldVal) NegateVal(val *fieldVal, magnitude uint32) *fieldVal { + // Negation in the field is just the prime minus the value. However, + // in order to allow negation against a field value without having to + // normalize/reduce it first, multiply by the magnitude (that is how + // "far" away it is from the normalized value) to adjust. Also, since + // negating a value pushes it one more order of magnitude away from the + // normalized range, add 1 to compensate. + // + // For some intuition here, imagine you're performing mod 12 arithmetic + // (picture a clock) and you are negating the number 7. So you start at + // 12 (which is of course 0 under mod 12) and count backwards (left on + // the clock) 7 times to arrive at 5. Notice this is just 12-7 = 5. + // Now, assume you're starting with 19, which is a number that is + // already larger than the modulus and congruent to 7 (mod 12). When a + // value is already in the desired range, its magnitude is 1. Since 19 + // is an additional "step", its magnitude (mod 12) is 2. Since any + // multiple of the modulus is conguent to zero (mod m), the answer can + // be shortcut by simply mulplying the magnitude by the modulus and + // subtracting. Keeping with the example, this would be (2*12)-19 = 5. + f.n[0] = (magnitude+1)*fieldPrimeWordZero - val.n[0] + f.n[1] = (magnitude+1)*fieldPrimeWordOne - val.n[1] + f.n[2] = (magnitude+1)*fieldBaseMask - val.n[2] + f.n[3] = (magnitude+1)*fieldBaseMask - val.n[3] + f.n[4] = (magnitude+1)*fieldBaseMask - val.n[4] + f.n[5] = (magnitude+1)*fieldBaseMask - val.n[5] + f.n[6] = (magnitude+1)*fieldBaseMask - val.n[6] + f.n[7] = (magnitude+1)*fieldBaseMask - val.n[7] + f.n[8] = (magnitude+1)*fieldBaseMask - val.n[8] + f.n[9] = (magnitude+1)*fieldMSBMask - val.n[9] + + return f +} + +// Negate negates the field value. The existing field value is modified. The +// caller must provide the magnitude of the field value for a correct result. +// +// The field value is returned to support chaining. This enables syntax like: +// f.Negate().AddInt(1) so that f = -f + 1. +func (f *fieldVal) Negate(magnitude uint32) *fieldVal { + return f.NegateVal(f, magnitude) +} + +// AddInt adds the passed integer to the existing field value and stores the +// result in f. This is a convenience function since it is fairly common to +// perform some arithemetic with small native integers. +// +// The field value is returned to support chaining. This enables syntax like: +// f.AddInt(1).Add(f2) so that f = f + 1 + f2. +func (f *fieldVal) AddInt(ui uint) *fieldVal { + // Since the field representation intentionally provides overflow bits, + // it's ok to use carryless addition as the carry bit is safely part of + // the word and will be normalized out. + f.n[0] += uint32(ui) + + return f +} + +// Add adds the passed value to the existing field value and stores the result +// in f. +// +// The field value is returned to support chaining. This enables syntax like: +// f.Add(f2).AddInt(1) so that f = f + f2 + 1. +func (f *fieldVal) Add(val *fieldVal) *fieldVal { + // Since the field representation intentionally provides overflow bits, + // it's ok to use carryless addition as the carry bit is safely part of + // each word and will be normalized out. This could obviously be done + // in a loop, but the unrolled version is faster. + f.n[0] += val.n[0] + f.n[1] += val.n[1] + f.n[2] += val.n[2] + f.n[3] += val.n[3] + f.n[4] += val.n[4] + f.n[5] += val.n[5] + f.n[6] += val.n[6] + f.n[7] += val.n[7] + f.n[8] += val.n[8] + f.n[9] += val.n[9] + + return f +} + +// Add2 adds the passed two field values together and stores the result in f. +// +// The field value is returned to support chaining. This enables syntax like: +// f3.Add2(f, f2).AddInt(1) so that f3 = f + f2 + 1. +func (f *fieldVal) Add2(val *fieldVal, val2 *fieldVal) *fieldVal { + // Since the field representation intentionally provides overflow bits, + // it's ok to use carryless addition as the carry bit is safely part of + // each word and will be normalized out. This could obviously be done + // in a loop, but the unrolled version is faster. + f.n[0] = val.n[0] + val2.n[0] + f.n[1] = val.n[1] + val2.n[1] + f.n[2] = val.n[2] + val2.n[2] + f.n[3] = val.n[3] + val2.n[3] + f.n[4] = val.n[4] + val2.n[4] + f.n[5] = val.n[5] + val2.n[5] + f.n[6] = val.n[6] + val2.n[6] + f.n[7] = val.n[7] + val2.n[7] + f.n[8] = val.n[8] + val2.n[8] + f.n[9] = val.n[9] + val2.n[9] + + return f +} + +// MulInt multiplies the field value by the passed int and stores the result in +// f. Note that this function can overflow if multiplying the value by any of +// the individual words exceeds a max uint32. Therefore it is important that +// the caller ensures no overflows will occur before using this function. +// +// The field value is returned to support chaining. This enables syntax like: +// f.MulInt(2).Add(f2) so that f = 2 * f + f2. +func (f *fieldVal) MulInt(val uint) *fieldVal { + // Since each word of the field representation can hold up to + // fieldOverflowBits extra bits which will be normalized out, it's safe + // to multiply each word without using a larger type or carry + // propagation so long as the values won't overflow a uint32. This + // could obviously be done in a loop, but the unrolled version is + // faster. + ui := uint32(val) + f.n[0] *= ui + f.n[1] *= ui + f.n[2] *= ui + f.n[3] *= ui + f.n[4] *= ui + f.n[5] *= ui + f.n[6] *= ui + f.n[7] *= ui + f.n[8] *= ui + f.n[9] *= ui + + return f +} + +// Mul multiplies the passed value to the existing field value and stores the +// result in f. Note that this function can overflow if multiplying any +// of the individual words exceeds a max uint32. In practice, this means the +// magnitude of either value invovled in the multiplication must be a max of +// 8. +// +// The field value is returned to support chaining. This enables syntax like: +// f.Mul(f2).AddInt(1) so that f = (f * f2) + 1. +func (f *fieldVal) Mul(val *fieldVal) *fieldVal { + return f.Mul2(f, val) +} + +// Mul2 multiplies the passed two field values together and stores the result +// result in f. Note that this function can overflow if multiplying any of +// the individual words exceeds a max uint32. In practice, this means the +// magnitude of either value invovled in the multiplication must be a max of +// 8. +// +// The field value is returned to support chaining. This enables syntax like: +// f3.Mul2(f, f2).AddInt(1) so that f3 = (f * f2) + 1. +func (f *fieldVal) Mul2(val *fieldVal, val2 *fieldVal) *fieldVal { + // This could be done with a couple of for loops and an array to store + // the intermediate terms, but this unrolled version is significantly + // faster. + + // Terms for 2^(fieldBase*0). + m := uint64(val.n[0]) * uint64(val2.n[0]) + t0 := m & fieldBaseMask + + // Terms for 2^(fieldBase*1). + m = (m >> fieldBase) + + uint64(val.n[0])*uint64(val2.n[1]) + + uint64(val.n[1])*uint64(val2.n[0]) + t1 := m & fieldBaseMask + + // Terms for 2^(fieldBase*2). + m = (m >> fieldBase) + + uint64(val.n[0])*uint64(val2.n[2]) + + uint64(val.n[1])*uint64(val2.n[1]) + + uint64(val.n[2])*uint64(val2.n[0]) + t2 := m & fieldBaseMask + + // Terms for 2^(fieldBase*3). + m = (m >> fieldBase) + + uint64(val.n[0])*uint64(val2.n[3]) + + uint64(val.n[1])*uint64(val2.n[2]) + + uint64(val.n[2])*uint64(val2.n[1]) + + uint64(val.n[3])*uint64(val2.n[0]) + t3 := m & fieldBaseMask + + // Terms for 2^(fieldBase*4). + m = (m >> fieldBase) + + uint64(val.n[0])*uint64(val2.n[4]) + + uint64(val.n[1])*uint64(val2.n[3]) + + uint64(val.n[2])*uint64(val2.n[2]) + + uint64(val.n[3])*uint64(val2.n[1]) + + uint64(val.n[4])*uint64(val2.n[0]) + t4 := m & fieldBaseMask + + // Terms for 2^(fieldBase*5). + m = (m >> fieldBase) + + uint64(val.n[0])*uint64(val2.n[5]) + + uint64(val.n[1])*uint64(val2.n[4]) + + uint64(val.n[2])*uint64(val2.n[3]) + + uint64(val.n[3])*uint64(val2.n[2]) + + uint64(val.n[4])*uint64(val2.n[1]) + + uint64(val.n[5])*uint64(val2.n[0]) + t5 := m & fieldBaseMask + + // Terms for 2^(fieldBase*6). + m = (m >> fieldBase) + + uint64(val.n[0])*uint64(val2.n[6]) + + uint64(val.n[1])*uint64(val2.n[5]) + + uint64(val.n[2])*uint64(val2.n[4]) + + uint64(val.n[3])*uint64(val2.n[3]) + + uint64(val.n[4])*uint64(val2.n[2]) + + uint64(val.n[5])*uint64(val2.n[1]) + + uint64(val.n[6])*uint64(val2.n[0]) + t6 := m & fieldBaseMask + + // Terms for 2^(fieldBase*7). + m = (m >> fieldBase) + + uint64(val.n[0])*uint64(val2.n[7]) + + uint64(val.n[1])*uint64(val2.n[6]) + + uint64(val.n[2])*uint64(val2.n[5]) + + uint64(val.n[3])*uint64(val2.n[4]) + + uint64(val.n[4])*uint64(val2.n[3]) + + uint64(val.n[5])*uint64(val2.n[2]) + + uint64(val.n[6])*uint64(val2.n[1]) + + uint64(val.n[7])*uint64(val2.n[0]) + t7 := m & fieldBaseMask + + // Terms for 2^(fieldBase*8). + m = (m >> fieldBase) + + uint64(val.n[0])*uint64(val2.n[8]) + + uint64(val.n[1])*uint64(val2.n[7]) + + uint64(val.n[2])*uint64(val2.n[6]) + + uint64(val.n[3])*uint64(val2.n[5]) + + uint64(val.n[4])*uint64(val2.n[4]) + + uint64(val.n[5])*uint64(val2.n[3]) + + uint64(val.n[6])*uint64(val2.n[2]) + + uint64(val.n[7])*uint64(val2.n[1]) + + uint64(val.n[8])*uint64(val2.n[0]) + t8 := m & fieldBaseMask + + // Terms for 2^(fieldBase*9). + m = (m >> fieldBase) + + uint64(val.n[0])*uint64(val2.n[9]) + + uint64(val.n[1])*uint64(val2.n[8]) + + uint64(val.n[2])*uint64(val2.n[7]) + + uint64(val.n[3])*uint64(val2.n[6]) + + uint64(val.n[4])*uint64(val2.n[5]) + + uint64(val.n[5])*uint64(val2.n[4]) + + uint64(val.n[6])*uint64(val2.n[3]) + + uint64(val.n[7])*uint64(val2.n[2]) + + uint64(val.n[8])*uint64(val2.n[1]) + + uint64(val.n[9])*uint64(val2.n[0]) + t9 := m & fieldBaseMask + + // Terms for 2^(fieldBase*10). + m = (m >> fieldBase) + + uint64(val.n[1])*uint64(val2.n[9]) + + uint64(val.n[2])*uint64(val2.n[8]) + + uint64(val.n[3])*uint64(val2.n[7]) + + uint64(val.n[4])*uint64(val2.n[6]) + + uint64(val.n[5])*uint64(val2.n[5]) + + uint64(val.n[6])*uint64(val2.n[4]) + + uint64(val.n[7])*uint64(val2.n[3]) + + uint64(val.n[8])*uint64(val2.n[2]) + + uint64(val.n[9])*uint64(val2.n[1]) + t10 := m & fieldBaseMask + + // Terms for 2^(fieldBase*11). + m = (m >> fieldBase) + + uint64(val.n[2])*uint64(val2.n[9]) + + uint64(val.n[3])*uint64(val2.n[8]) + + uint64(val.n[4])*uint64(val2.n[7]) + + uint64(val.n[5])*uint64(val2.n[6]) + + uint64(val.n[6])*uint64(val2.n[5]) + + uint64(val.n[7])*uint64(val2.n[4]) + + uint64(val.n[8])*uint64(val2.n[3]) + + uint64(val.n[9])*uint64(val2.n[2]) + t11 := m & fieldBaseMask + + // Terms for 2^(fieldBase*12). + m = (m >> fieldBase) + + uint64(val.n[3])*uint64(val2.n[9]) + + uint64(val.n[4])*uint64(val2.n[8]) + + uint64(val.n[5])*uint64(val2.n[7]) + + uint64(val.n[6])*uint64(val2.n[6]) + + uint64(val.n[7])*uint64(val2.n[5]) + + uint64(val.n[8])*uint64(val2.n[4]) + + uint64(val.n[9])*uint64(val2.n[3]) + t12 := m & fieldBaseMask + + // Terms for 2^(fieldBase*13). + m = (m >> fieldBase) + + uint64(val.n[4])*uint64(val2.n[9]) + + uint64(val.n[5])*uint64(val2.n[8]) + + uint64(val.n[6])*uint64(val2.n[7]) + + uint64(val.n[7])*uint64(val2.n[6]) + + uint64(val.n[8])*uint64(val2.n[5]) + + uint64(val.n[9])*uint64(val2.n[4]) + t13 := m & fieldBaseMask + + // Terms for 2^(fieldBase*14). + m = (m >> fieldBase) + + uint64(val.n[5])*uint64(val2.n[9]) + + uint64(val.n[6])*uint64(val2.n[8]) + + uint64(val.n[7])*uint64(val2.n[7]) + + uint64(val.n[8])*uint64(val2.n[6]) + + uint64(val.n[9])*uint64(val2.n[5]) + t14 := m & fieldBaseMask + + // Terms for 2^(fieldBase*15). + m = (m >> fieldBase) + + uint64(val.n[6])*uint64(val2.n[9]) + + uint64(val.n[7])*uint64(val2.n[8]) + + uint64(val.n[8])*uint64(val2.n[7]) + + uint64(val.n[9])*uint64(val2.n[6]) + t15 := m & fieldBaseMask + + // Terms for 2^(fieldBase*16). + m = (m >> fieldBase) + + uint64(val.n[7])*uint64(val2.n[9]) + + uint64(val.n[8])*uint64(val2.n[8]) + + uint64(val.n[9])*uint64(val2.n[7]) + t16 := m & fieldBaseMask + + // Terms for 2^(fieldBase*17). + m = (m >> fieldBase) + + uint64(val.n[8])*uint64(val2.n[9]) + + uint64(val.n[9])*uint64(val2.n[8]) + t17 := m & fieldBaseMask + + // Terms for 2^(fieldBase*18). + m = (m >> fieldBase) + uint64(val.n[9])*uint64(val2.n[9]) + t18 := m & fieldBaseMask + + // What's left is for 2^(fieldBase*19). + t19 := m >> fieldBase + + // At this point, all of the terms are grouped into their respective + // base. + // + // Per [HAC] section 14.3.4: Reduction method of moduli of special form, + // when the modulus is of the special form m = b^t - c, highly efficient + // reduction can be achieved per the provided algorithm. + // + // The secp256k1 prime is equivalent to 2^256 - 4294968273, so it fits + // this criteria. + // + // 4294968273 in field representation (base 2^26) is: + // n[0] = 977 + // n[1] = 64 + // That is to say (2^26 * 64) + 977 = 4294968273 + // + // Since each word is in base 26, the upper terms (t10 and up) start + // at 260 bits (versus the final desired range of 256 bits), so the + // field representation of 'c' from above needs to be adjusted for the + // extra 4 bits by multiplying it by 2^4 = 16. 4294968273 * 16 = + // 68719492368. Thus, the adjusted field representation of 'c' is: + // n[0] = 977 * 16 = 15632 + // n[1] = 64 * 16 = 1024 + // That is to say (2^26 * 1024) + 15632 = 68719492368 + // + // To reduce the final term, t19, the entire 'c' value is needed instead + // of only n[0] because there are no more terms left to handle n[1]. + // This means there might be some magnitude left in the upper bits that + // is handled below. + m = t0 + t10*15632 + t0 = m & fieldBaseMask + m = (m >> fieldBase) + t1 + t10*1024 + t11*15632 + t1 = m & fieldBaseMask + m = (m >> fieldBase) + t2 + t11*1024 + t12*15632 + t2 = m & fieldBaseMask + m = (m >> fieldBase) + t3 + t12*1024 + t13*15632 + t3 = m & fieldBaseMask + m = (m >> fieldBase) + t4 + t13*1024 + t14*15632 + t4 = m & fieldBaseMask + m = (m >> fieldBase) + t5 + t14*1024 + t15*15632 + t5 = m & fieldBaseMask + m = (m >> fieldBase) + t6 + t15*1024 + t16*15632 + t6 = m & fieldBaseMask + m = (m >> fieldBase) + t7 + t16*1024 + t17*15632 + t7 = m & fieldBaseMask + m = (m >> fieldBase) + t8 + t17*1024 + t18*15632 + t8 = m & fieldBaseMask + m = (m >> fieldBase) + t9 + t18*1024 + t19*68719492368 + t9 = m & fieldMSBMask + m = m >> fieldMSBBits + + // At this point, if the magnitude is greater than 0, the overall value + // is greater than the max possible 256-bit value. In particular, it is + // "how many times larger" than the max value it is. + // + // The algorithm presented in [HAC] section 14.3.4 repeats until the + // quotient is zero. However, due to the above, we already know at + // least how many times we would need to repeat as it's the value + // currently in m. Thus we can simply multiply the magnitude by the + // field representation of the prime and do a single iteration. Notice + // that nothing will be changed when the magnitude is zero, so we could + // skip this in that case, however always running regardless allows it + // to run in constant time. The final result will be in the range + // 0 <= result <= prime + (2^64 - c), so it is guaranteed to have a + // magnitude of 1, but it is denormalized. + d := t0 + m*977 + f.n[0] = uint32(d & fieldBaseMask) + d = (d >> fieldBase) + t1 + m*64 + f.n[1] = uint32(d & fieldBaseMask) + f.n[2] = uint32((d >> fieldBase) + t2) + f.n[3] = uint32(t3) + f.n[4] = uint32(t4) + f.n[5] = uint32(t5) + f.n[6] = uint32(t6) + f.n[7] = uint32(t7) + f.n[8] = uint32(t8) + f.n[9] = uint32(t9) + + return f +} + +// Square squares the field value. The existing field value is modified. Note +// that this function can overflow if multiplying any of the individual words +// exceeds a max uint32. In practice, this means the magnitude of the field +// must be a max of 8 to prevent overflow. +// +// The field value is returned to support chaining. This enables syntax like: +// f.Square().Mul(f2) so that f = f^2 * f2. +func (f *fieldVal) Square() *fieldVal { + return f.SquareVal(f) +} + +// SquareVal squares the passed value and stores the result in f. Note that +// this function can overflow if multiplying any of the individual words +// exceeds a max uint32. In practice, this means the magnitude of the field +// being squred must be a max of 8 to prevent overflow. +// +// The field value is returned to support chaining. This enables syntax like: +// f3.SquareVal(f).Mul(f) so that f3 = f^2 * f = f^3. +func (f *fieldVal) SquareVal(val *fieldVal) *fieldVal { + // This could be done with a couple of for loops and an array to store + // the intermediate terms, but this unrolled version is significantly + // faster. + + // Terms for 2^(fieldBase*0). + m := uint64(val.n[0]) * uint64(val.n[0]) + t0 := m & fieldBaseMask + + // Terms for 2^(fieldBase*1). + m = (m >> fieldBase) + 2*uint64(val.n[0])*uint64(val.n[1]) + t1 := m & fieldBaseMask + + // Terms for 2^(fieldBase*2). + m = (m >> fieldBase) + + 2*uint64(val.n[0])*uint64(val.n[2]) + + uint64(val.n[1])*uint64(val.n[1]) + t2 := m & fieldBaseMask + + // Terms for 2^(fieldBase*3). + m = (m >> fieldBase) + + 2*uint64(val.n[0])*uint64(val.n[3]) + + 2*uint64(val.n[1])*uint64(val.n[2]) + t3 := m & fieldBaseMask + + // Terms for 2^(fieldBase*4). + m = (m >> fieldBase) + + 2*uint64(val.n[0])*uint64(val.n[4]) + + 2*uint64(val.n[1])*uint64(val.n[3]) + + uint64(val.n[2])*uint64(val.n[2]) + t4 := m & fieldBaseMask + + // Terms for 2^(fieldBase*5). + m = (m >> fieldBase) + + 2*uint64(val.n[0])*uint64(val.n[5]) + + 2*uint64(val.n[1])*uint64(val.n[4]) + + 2*uint64(val.n[2])*uint64(val.n[3]) + t5 := m & fieldBaseMask + + // Terms for 2^(fieldBase*6). + m = (m >> fieldBase) + + 2*uint64(val.n[0])*uint64(val.n[6]) + + 2*uint64(val.n[1])*uint64(val.n[5]) + + 2*uint64(val.n[2])*uint64(val.n[4]) + + uint64(val.n[3])*uint64(val.n[3]) + t6 := m & fieldBaseMask + + // Terms for 2^(fieldBase*7). + m = (m >> fieldBase) + + 2*uint64(val.n[0])*uint64(val.n[7]) + + 2*uint64(val.n[1])*uint64(val.n[6]) + + 2*uint64(val.n[2])*uint64(val.n[5]) + + 2*uint64(val.n[3])*uint64(val.n[4]) + t7 := m & fieldBaseMask + + // Terms for 2^(fieldBase*8). + m = (m >> fieldBase) + + 2*uint64(val.n[0])*uint64(val.n[8]) + + 2*uint64(val.n[1])*uint64(val.n[7]) + + 2*uint64(val.n[2])*uint64(val.n[6]) + + 2*uint64(val.n[3])*uint64(val.n[5]) + + uint64(val.n[4])*uint64(val.n[4]) + t8 := m & fieldBaseMask + + // Terms for 2^(fieldBase*9). + m = (m >> fieldBase) + + 2*uint64(val.n[0])*uint64(val.n[9]) + + 2*uint64(val.n[1])*uint64(val.n[8]) + + 2*uint64(val.n[2])*uint64(val.n[7]) + + 2*uint64(val.n[3])*uint64(val.n[6]) + + 2*uint64(val.n[4])*uint64(val.n[5]) + t9 := m & fieldBaseMask + + // Terms for 2^(fieldBase*10). + m = (m >> fieldBase) + + 2*uint64(val.n[1])*uint64(val.n[9]) + + 2*uint64(val.n[2])*uint64(val.n[8]) + + 2*uint64(val.n[3])*uint64(val.n[7]) + + 2*uint64(val.n[4])*uint64(val.n[6]) + + uint64(val.n[5])*uint64(val.n[5]) + t10 := m & fieldBaseMask + + // Terms for 2^(fieldBase*11). + m = (m >> fieldBase) + + 2*uint64(val.n[2])*uint64(val.n[9]) + + 2*uint64(val.n[3])*uint64(val.n[8]) + + 2*uint64(val.n[4])*uint64(val.n[7]) + + 2*uint64(val.n[5])*uint64(val.n[6]) + t11 := m & fieldBaseMask + + // Terms for 2^(fieldBase*12). + m = (m >> fieldBase) + + 2*uint64(val.n[3])*uint64(val.n[9]) + + 2*uint64(val.n[4])*uint64(val.n[8]) + + 2*uint64(val.n[5])*uint64(val.n[7]) + + uint64(val.n[6])*uint64(val.n[6]) + t12 := m & fieldBaseMask + + // Terms for 2^(fieldBase*13). + m = (m >> fieldBase) + + 2*uint64(val.n[4])*uint64(val.n[9]) + + 2*uint64(val.n[5])*uint64(val.n[8]) + + 2*uint64(val.n[6])*uint64(val.n[7]) + t13 := m & fieldBaseMask + + // Terms for 2^(fieldBase*14). + m = (m >> fieldBase) + + 2*uint64(val.n[5])*uint64(val.n[9]) + + 2*uint64(val.n[6])*uint64(val.n[8]) + + uint64(val.n[7])*uint64(val.n[7]) + t14 := m & fieldBaseMask + + // Terms for 2^(fieldBase*15). + m = (m >> fieldBase) + + 2*uint64(val.n[6])*uint64(val.n[9]) + + 2*uint64(val.n[7])*uint64(val.n[8]) + t15 := m & fieldBaseMask + + // Terms for 2^(fieldBase*16). + m = (m >> fieldBase) + + 2*uint64(val.n[7])*uint64(val.n[9]) + + uint64(val.n[8])*uint64(val.n[8]) + t16 := m & fieldBaseMask + + // Terms for 2^(fieldBase*17). + m = (m >> fieldBase) + 2*uint64(val.n[8])*uint64(val.n[9]) + t17 := m & fieldBaseMask + + // Terms for 2^(fieldBase*18). + m = (m >> fieldBase) + uint64(val.n[9])*uint64(val.n[9]) + t18 := m & fieldBaseMask + + // What's left is for 2^(fieldBase*19). + t19 := m >> fieldBase + + // At this point, all of the terms are grouped into their respective + // base. + // + // Per [HAC] section 14.3.4: Reduction method of moduli of special form, + // when the modulus is of the special form m = b^t - c, highly efficient + // reduction can be achieved per the provided algorithm. + // + // The secp256k1 prime is equivalent to 2^256 - 4294968273, so it fits + // this criteria. + // + // 4294968273 in field representation (base 2^26) is: + // n[0] = 977 + // n[1] = 64 + // That is to say (2^26 * 64) + 977 = 4294968273 + // + // Since each word is in base 26, the upper terms (t10 and up) start + // at 260 bits (versus the final desired range of 256 bits), so the + // field representation of 'c' from above needs to be adjusted for the + // extra 4 bits by multiplying it by 2^4 = 16. 4294968273 * 16 = + // 68719492368. Thus, the adjusted field representation of 'c' is: + // n[0] = 977 * 16 = 15632 + // n[1] = 64 * 16 = 1024 + // That is to say (2^26 * 1024) + 15632 = 68719492368 + // + // To reduce the final term, t19, the entire 'c' value is needed instead + // of only n[0] because there are no more terms left to handle n[1]. + // This means there might be some magnitude left in the upper bits that + // is handled below. + m = t0 + t10*15632 + t0 = m & fieldBaseMask + m = (m >> fieldBase) + t1 + t10*1024 + t11*15632 + t1 = m & fieldBaseMask + m = (m >> fieldBase) + t2 + t11*1024 + t12*15632 + t2 = m & fieldBaseMask + m = (m >> fieldBase) + t3 + t12*1024 + t13*15632 + t3 = m & fieldBaseMask + m = (m >> fieldBase) + t4 + t13*1024 + t14*15632 + t4 = m & fieldBaseMask + m = (m >> fieldBase) + t5 + t14*1024 + t15*15632 + t5 = m & fieldBaseMask + m = (m >> fieldBase) + t6 + t15*1024 + t16*15632 + t6 = m & fieldBaseMask + m = (m >> fieldBase) + t7 + t16*1024 + t17*15632 + t7 = m & fieldBaseMask + m = (m >> fieldBase) + t8 + t17*1024 + t18*15632 + t8 = m & fieldBaseMask + m = (m >> fieldBase) + t9 + t18*1024 + t19*68719492368 + t9 = m & fieldMSBMask + m = m >> fieldMSBBits + + // At this point, if the magnitude is greater than 0, the overall value + // is greater than the max possible 256-bit value. In particular, it is + // "how many times larger" than the max value it is. + // + // The algorithm presented in [HAC] section 14.3.4 repeats until the + // quotient is zero. However, due to the above, we already know at + // least how many times we would need to repeat as it's the value + // currently in m. Thus we can simply multiply the magnitude by the + // field representation of the prime and do a single iteration. Notice + // that nothing will be changed when the magnitude is zero, so we could + // skip this in that case, however always running regardless allows it + // to run in constant time. The final result will be in the range + // 0 <= result <= prime + (2^64 - c), so it is guaranteed to have a + // magnitude of 1, but it is denormalized. + n := t0 + m*977 + f.n[0] = uint32(n & fieldBaseMask) + n = (n >> fieldBase) + t1 + m*64 + f.n[1] = uint32(n & fieldBaseMask) + f.n[2] = uint32((n >> fieldBase) + t2) + f.n[3] = uint32(t3) + f.n[4] = uint32(t4) + f.n[5] = uint32(t5) + f.n[6] = uint32(t6) + f.n[7] = uint32(t7) + f.n[8] = uint32(t8) + f.n[9] = uint32(t9) + + return f +} + +// Inverse finds the modular multiplicative inverse of the field value. The +// existing field value is modified. +// +// The field value is returned to support chaining. This enables syntax like: +// f.Inverse().Mul(f2) so that f = f^-1 * f2. +func (f *fieldVal) Inverse() *fieldVal { + // Fermat's little theorem states that for a nonzero number a and prime + // prime p, a^(p-1) = 1 (mod p). Since the multipliciative inverse is + // a*b = 1 (mod p), it follows that b = a*a^(p-2) = a^(p-1) = 1 (mod p). + // Thus, a^(p-2) is the multiplicative inverse. + // + // In order to efficiently compute a^(p-2), p-2 needs to be split into + // a sequence of squares and multipications that minimizes the number of + // multiplications needed (since they are more costly than squarings). + // Intermediate results are saved and reused as well. + // + // The secp256k1 prime - 2 is 2^256 - 4294968275. + // + // This has a cost of 258 field squarings and 33 field multiplications. + var a2, a3, a4, a10, a11, a21, a42, a45, a63, a1019, a1023 fieldVal + a2.SquareVal(f) + a3.Mul2(&a2, f) + a4.SquareVal(&a2) + a10.SquareVal(&a4).Mul(&a2) + a11.Mul2(&a10, f) + a21.Mul2(&a10, &a11) + a42.SquareVal(&a21) + a45.Mul2(&a42, &a3) + a63.Mul2(&a42, &a21) + a1019.SquareVal(&a63).Square().Square().Square().Mul(&a11) + a1023.Mul2(&a1019, &a4) + f.Set(&a63) // f = a^(2^6 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^11 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^16 - 1024) + f.Mul(&a1023) // f = a^(2^16 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^21 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^26 - 1024) + f.Mul(&a1023) // f = a^(2^26 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^31 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^36 - 1024) + f.Mul(&a1023) // f = a^(2^36 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^41 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^46 - 1024) + f.Mul(&a1023) // f = a^(2^46 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^51 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^56 - 1024) + f.Mul(&a1023) // f = a^(2^56 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^61 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^66 - 1024) + f.Mul(&a1023) // f = a^(2^66 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^71 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^76 - 1024) + f.Mul(&a1023) // f = a^(2^76 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^81 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^86 - 1024) + f.Mul(&a1023) // f = a^(2^86 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^91 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^96 - 1024) + f.Mul(&a1023) // f = a^(2^96 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^101 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^106 - 1024) + f.Mul(&a1023) // f = a^(2^106 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^111 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^116 - 1024) + f.Mul(&a1023) // f = a^(2^116 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^121 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^126 - 1024) + f.Mul(&a1023) // f = a^(2^126 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^131 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^136 - 1024) + f.Mul(&a1023) // f = a^(2^136 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^141 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^146 - 1024) + f.Mul(&a1023) // f = a^(2^146 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^151 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^156 - 1024) + f.Mul(&a1023) // f = a^(2^156 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^161 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^166 - 1024) + f.Mul(&a1023) // f = a^(2^166 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^171 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^176 - 1024) + f.Mul(&a1023) // f = a^(2^176 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^181 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^186 - 1024) + f.Mul(&a1023) // f = a^(2^186 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^191 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^196 - 1024) + f.Mul(&a1023) // f = a^(2^196 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^201 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^206 - 1024) + f.Mul(&a1023) // f = a^(2^206 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^211 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^216 - 1024) + f.Mul(&a1023) // f = a^(2^216 - 1) + f.Square().Square().Square().Square().Square() // f = a^(2^221 - 32) + f.Square().Square().Square().Square().Square() // f = a^(2^226 - 1024) + f.Mul(&a1019) // f = a^(2^226 - 5) + f.Square().Square().Square().Square().Square() // f = a^(2^231 - 160) + f.Square().Square().Square().Square().Square() // f = a^(2^236 - 5120) + f.Mul(&a1023) // f = a^(2^236 - 4097) + f.Square().Square().Square().Square().Square() // f = a^(2^241 - 131104) + f.Square().Square().Square().Square().Square() // f = a^(2^246 - 4195328) + f.Mul(&a1023) // f = a^(2^246 - 4194305) + f.Square().Square().Square().Square().Square() // f = a^(2^251 - 134217760) + f.Square().Square().Square().Square().Square() // f = a^(2^256 - 4294968320) + return f.Mul(&a45) // f = a^(2^256 - 4294968275) = a^(p-2) +} + +// NewFieldVal returns a new field value set to 0. Callers of this package +// don't need to work with field values directly. This is provided for testing +// purposes. +func NewFieldVal() *fieldVal { + return new(fieldVal) +}