diff --git a/btcec.go b/btcec.go
index a2a823ba..28822728 100644
--- a/btcec.go
+++ b/btcec.go
@@ -6,6 +6,10 @@
 
 package btcec
 
+// References:
+//   [SECG]: Recommended Elliptic Curve Domain Parameters
+//     http://www.secg.org/download/aid-784/sec2-v2.pdf
+
 // This package operates, internally, on Jacobian coordinates. For a given
 // (x, y) position on the curve, the Jacobian coordinates are (x1, y1, z1)
 // where x = x1/z1² and y = y1/z1³. The greatest speedups come when the whole
@@ -22,6 +26,13 @@ import (
 //TODO: examine if we need to care about EC optimization as descibed here
 //      https://bitcointalk.org/index.php?topic=155054.0;all
 
+var (
+	// fieldOne is simply the integer 1 in field representation.  It is
+	// used to avoid needing to create it multiple times during the internal
+	// arithmetic.
+	fieldOne = new(fieldVal).SetInt(1)
+)
+
 // KoblitzCurve supports a koblitz curve implementation that fits the ECC Curve
 // interface from crypto/elliptic.
 type KoblitzCurve struct {
@@ -29,200 +40,584 @@ type KoblitzCurve struct {
 	q *big.Int
 }
 
-// Params returns the parameters fro the curve.
+// Params returns the parameters for the curve.
 func (curve *KoblitzCurve) Params() *elliptic.CurveParams {
 	return curve.CurveParams
 }
 
+// bigAffineToField converts the affine point (x, y), given as big integers,
+// into the equivalent pair of field values.
+func (curve *KoblitzCurve) bigAffineToField(x, y *big.Int) (*fieldVal, *fieldVal) {
+	fx := new(fieldVal)
+	fx.SetByteSlice(x.Bytes())
+	fy := new(fieldVal)
+	fy.SetByteSlice(y.Bytes())
+	return fx, fy
+}
+
+// fieldJacobianToBigAffine converts the Jacobian point (x, y, z) to an affine
+// point as big integers.  The passed x, y, and z are overwritten in place.
+func (curve *KoblitzCurve) fieldJacobianToBigAffine(x, y, z *fieldVal) (*big.Int, *big.Int) {
+	// Inversions are expensive and both point addition and point doubling
+	// are faster when working with points that have a z value of one.  So,
+	// if the point needs to be converted to affine, go ahead and normalize
+	// the point itself at the same time as the calculation is the same.
+	var zInv, tempZ fieldVal
+	zInv.Set(z).Inverse()   // zInv = Z^-1
+	tempZ.SquareVal(&zInv)  // tempZ = Z^-2
+	x.Mul(&tempZ)           // X = X/Z^2 (mag: 1)
+	y.Mul(tempZ.Mul(&zInv)) // Y = Y/Z^3 (mag: 1)
+	z.SetInt(1)             // Z = 1 (mag: 1)
+
+	// Normalize the x and y values.
+	x.Normalize()
+	y.Normalize()
+
+	// Convert the field values for the now affine point to big.Ints.
+	x3, y3 := new(big.Int), new(big.Int)
+	x3.SetBytes(x.Bytes()[:])
+	y3.SetBytes(y.Bytes()[:])
+	return x3, y3
+}
+
 // IsOnCurve returns boolean if the point (x,y) is on the curve.
 // Part of the elliptic.Curve interface. This function differs from the
 // crypto/elliptic algorithm since a = 0 not -3.
 func (curve *KoblitzCurve) IsOnCurve(x, y *big.Int) bool {
-	// y² = x³ + b
-	y2 := new(big.Int).Mul(y, y) //y²
-	y2.Mod(y2, curve.P)          //y²%P
+	// Convert big ints to field values for faster arithmetic.
+	fx, fy := curve.bigAffineToField(x, y)
 
-	x3 := new(big.Int).Mul(x, x) //x²
-	x3.Mul(x3, x)                //x³
-
-	x3.Add(x3, curve.B) //x³+B
-	x3.Mod(x3, curve.P) //(x³+B)%P
-
-	return x3.Cmp(y2) == 0
+	// secp256k1 is y^2 = x^3 + 7.  The 7 is hard-coded here instead of curve.B.
+	y2 := new(fieldVal).SquareVal(fy).Normalize()
+	result := new(fieldVal).SquareVal(fx).Mul(fx).AddInt(7).Normalize()
+	return y2.Equals(result)
 }
 
-// zForAffine returns a Jacobian Z value for the affine point (x, y). If x and
-// y are zero, it assumes that they represent the point at infinity because (0,
-// 0) is not on the any of the curves handled here.
-func zForAffine(x, y *big.Int) *big.Int {
-	z := new(big.Int)
-	if x.Sign() != 0 || y.Sign() != 0 {
-		z.SetInt64(1)
-	}
-	return z
-}
+// addZ1AndZ2EqualsOne adds two Jacobian points that are already known to have
+// z values of 1 and stores the result in (x3, y3, z3).  That is to say
+// (x1, y1, 1) + (x2, y2, 1) = (x3, y3, z3).  It is faster than the generic add
+// routine since the z value multiplications are avoided.  Note that x1, y1, x2,
+// and y2 are normalized in place.
+func (curve *KoblitzCurve) addZ1AndZ2EqualsOne(x1, y1, x2, y2, x3, y3, z3 *fieldVal) {
+	// To compute the point addition efficiently, this implementation splits
+	// the equation into intermediate elements which are used to minimize
+	// the number of field multiplications using the method shown at:
+	// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-mmadd-2007-bl
+	//
+	// In particular it performs the calculations using the following:
+	// H = X2-X1, HH = H^2, I = 4*HH, J = H*I, r = 2*(Y2-Y1), V = X1*I
+	// X3 = r^2-J-2*V, Y3 = r*(V-X3)-2*Y1*J, Z3 = 2*H
+	//
+	// This results in a cost of 4 field multiplications, 2 field squarings,
+	// 6 field additions, and 5 integer multiplications.
 
-// affineFromJacobian reverses the Jacobian transform. See the comment at the
-// top of the file. If the point is ∞ it returns 0, 0.
-func (curve *KoblitzCurve) affineFromJacobian(x, y, z *big.Int) (xOut, yOut *big.Int) {
-	if z.Sign() == 0 {
-		return new(big.Int), new(big.Int)
+	// When the x coordinates are the same for two points on the curve, the
+	// y coordinates either must be the same, in which case it is point
+	// doubling, or they are opposite and the result is the point at
+	// infinity per the group law for elliptic curve cryptography.
+	x1.Normalize()
+	y1.Normalize()
+	x2.Normalize()
+	y2.Normalize()
+	if x1.Equals(x2) {
+		if y1.Equals(y2) {
+			// Since x1 == x2 and y1 == y2, point doubling must be
+			// done to avoid dividing by zero.  A fresh z of one is
+			// used since doubleJacobian normalizes z1 in place.
+			curve.doubleJacobian(x1, y1, new(fieldVal).SetInt(1), x3, y3, z3)
+			return
+		}
+
+		// Since x1 == x2 and y1 == -y2, the sum is the point at
+		// infinity per the group law.
+		x3.SetInt(0)
+		y3.SetInt(0)
+		z3.SetInt(0)
+		return
 	}
 
-	zinv := new(big.Int).ModInverse(z, curve.P)
-	zinvsq := new(big.Int).Mul(zinv, zinv)
+	// Calculate X3, Y3, and Z3 according to the intermediate elements
+	// breakdown above.
+	var h, i, j, r, v fieldVal
+	var negJ, neg2V, negX3 fieldVal
+	h.Set(x1).Negate(1).Add(x2)                // H = X2-X1 (mag: 3)
+	i.SquareVal(&h).MulInt(4)                  // I = 4*H^2 (mag: 4)
+	j.Mul2(&h, &i)                             // J = H*I (mag: 1)
+	r.Set(y1).Negate(1).Add(y2).MulInt(2)      // r = 2*(Y2-Y1) (mag: 6)
+	v.Mul2(x1, &i)                             // V = X1*I (mag: 1)
+	negJ.Set(&j).Negate(1)                     // negJ = -J (mag: 2)
+	neg2V.Set(&v).MulInt(2).Negate(2)          // neg2V = -(2*V) (mag: 3)
+	x3.Set(&r).Square().Add(&negJ).Add(&neg2V) // X3 = r^2-J-2*V (mag: 6)
+	negX3.Set(x3).Negate(6)                    // negX3 = -X3 (mag: 7)
+	j.Mul(y1).MulInt(2).Negate(2)              // J = -(2*Y1*J) (mag: 3)
+	y3.Set(&v).Add(&negX3).Mul(&r).Add(&j)     // Y3 = r*(V-X3)-2*Y1*J (mag: 4)
+	z3.Set(&h).MulInt(2)                       // Z3 = 2*H (mag: 6)
 
-	xOut = new(big.Int).Mul(x, zinvsq)
-	xOut.Mod(xOut, curve.P)
-	zinvsq.Mul(zinvsq, zinv)
-	yOut = new(big.Int).Mul(y, zinvsq)
-	yOut.Mod(yOut, curve.P)
-	return
+	// Normalize the resulting field values to a magnitude of 1 as needed.
+	x3.Normalize()
+	y3.Normalize()
+	z3.Normalize()
 }
 
-// Add returns the sum of (x1,y1 and (x2,y2). Part of the elliptic.Curve
-// interface.
-func (curve *KoblitzCurve) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
-	z1 := zForAffine(x1, y1)
-	z2 := zForAffine(x2, y2)
-	return curve.affineFromJacobian(curve.addJacobian(x1, y1, z1, x2, y2, z2))
+// addZ1EqualsZ2 adds two Jacobian points that are already known to have the
+// same z value and stores the result in (x3, y3, z3).  That is to say
+// (x1, y1, z1) + (x2, y2, z1) = (x3, y3, z3).  It performs faster addition than
+// the generic add routine since less arithmetic is needed due to the known
+// equivalence.
+func (curve *KoblitzCurve) addZ1EqualsZ2(x1, y1, z1, x2, y2, x3, y3, z3 *fieldVal) {
+	// To compute the point addition efficiently, this implementation splits
+	// the equation into intermediate elements which are used to minimize
+	// the number of field multiplications using a slightly modified version
+	// of the method shown at:
+	// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-mmadd-2007-bl
+	//
+	// In particular it performs the calculations using the following:
+	// A = X2-X1, B = A^2, C=Y2-Y1, D = C^2, E = X1*B, F = X2*B
+	// X3 = D-E-F, Y3 = C*(E-X3)-Y1*(F-E), Z3 = Z1*A
+	//
+	// This results in a cost of 5 field multiplications, 2 field squarings,
+	// 9 field additions, and 0 integer multiplications.
+
+	// When the x coordinates are the same for two points on the curve, the
+	// y coordinates either must be the same, in which case it is point
+	// doubling, or they are opposite and the result is the point at
+	// infinity per the group law for elliptic curve cryptography.
+	x1.Normalize()
+	y1.Normalize()
+	x2.Normalize()
+	y2.Normalize()
+	if x1.Equals(x2) {
+		if y1.Equals(y2) {
+			// Since x1 == x2 and y1 == y2, point doubling must be
+			// done, otherwise the addition would end up dividing
+			// by zero.
+			curve.doubleJacobian(x1, y1, z1, x3, y3, z3)
+			return
+		}
+
+		// Since x1 == x2 and y1 == -y2, the sum is the point at
+		// infinity per the group law.
+		x3.SetInt(0)
+		y3.SetInt(0)
+		z3.SetInt(0)
+		return
+	}
+
+	// Calculate X3, Y3, and Z3 according to the intermediate elements
+	// breakdown above.
+	var a, b, c, d, e, f fieldVal
+	var negX1, negY1, negE, negX3 fieldVal
+	negX1.Set(x1).Negate(1)                // negX1 = -X1 (mag: 2)
+	negY1.Set(y1).Negate(1)                // negY1 = -Y1 (mag: 2)
+	a.Set(&negX1).Add(x2)                  // A = X2-X1 (mag: 3)
+	b.SquareVal(&a)                        // B = A^2 (mag: 1)
+	c.Set(&negY1).Add(y2)                  // C = Y2-Y1 (mag: 3)
+	d.SquareVal(&c)                        // D = C^2 (mag: 1)
+	e.Mul2(x1, &b)                         // E = X1*B (mag: 1)
+	negE.Set(&e).Negate(1)                 // negE = -E (mag: 2)
+	f.Mul2(x2, &b)                         // F = X2*B (mag: 1)
+	x3.Add2(&e, &f).Negate(3).Add(&d)      // X3 = D-E-F (mag: 5)
+	negX3.Set(x3).Negate(5).Normalize()    // negX3 = -X3 (mag: 1)
+	y3.Set(y1).Mul(f.Add(&negE)).Negate(3) // Y3 = -(Y1*(F-E)) (mag: 4)
+	y3.Add(e.Add(&negX3).Mul(&c))          // Y3 = C*(E-X3)+Y3 (mag: 5)
+	z3.Mul2(z1, &a)                        // Z3 = Z1*A (mag: 1)
+
+	// Normalize x3 and y3 to a magnitude of 1.  z3 is not normalized here.
+	x3.Normalize()
+	y3.Normalize()
 }
 
-// addJacobian takes two points in Jacobian coordinates, (x1, y1, z1) and
-// (x2, y2, z2) and returns their sum, also in Jacobian form.
-func (curve *KoblitzCurve) addJacobian(x1, y1, z1, x2, y2, z2 *big.Int) (*big.Int, *big.Int, *big.Int) {
-	// See http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
-	x3, y3, z3 := new(big.Int), new(big.Int), new(big.Int)
-	if z1.Sign() == 0 {
+// addZ2EqualsOne adds two Jacobian points when the second point is already
+// known to have a z value of 1 (and the z value for the first point is not 1)
+// and stores the result in (x3, y3, z3).  That is to say (x1, y1, z1) +
+// (x2, y2, 1) = (x3, y3, z3).  It performs faster addition than the generic
+// add routine since less arithmetic is needed due to the ability to avoid
+// multiplications by the second point's z value.
+func (curve *KoblitzCurve) addZ2EqualsOne(x1, y1, z1, x2, y2, x3, y3, z3 *fieldVal) {
+	// To compute the point addition efficiently, this implementation splits
+	// the equation into intermediate elements which are used to minimize
+	// the number of field multiplications using the method shown at:
+	// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl
+	//
+	// In particular it performs the calculations using the following:
+	// Z1Z1 = Z1^2, U2 = X2*Z1Z1, S2 = Y2*Z1*Z1Z1, H = U2-X1, HH = H^2,
+	// I = 4*HH, J = H*I, r = 2*(S2-Y1), V = X1*I
+	// X3 = r^2-J-2*V, Y3 = r*(V-X3)-2*Y1*J, Z3 = (Z1+H)^2-Z1Z1-HH
+	//
+	// This results in a cost of 7 field multiplications, 4 field squarings,
+	// 9 field additions, and 4 integer multiplications.
+
+	// When the x coordinates are the same for two points on the curve, the
+	// y coordinates either must be the same, in which case it is point
+	// doubling, or they are opposite and the result is the point at
+	// infinity per the group law for elliptic curve cryptography.  Since
+	// any number of Jacobian coordinates can represent the same affine
+	// point, the x and y values need to be converted to like terms.  Due to
+	// the assumption made for this function that the second point has a z
+	// value of 1 (z2=1), the first point is already "converted".
+	var z1z1, u2, s2 fieldVal
+	x1.Normalize()
+	y1.Normalize()
+	z1z1.SquareVal(z1)                        // Z1Z1 = Z1^2 (mag: 1)
+	u2.Set(x2).Mul(&z1z1).Normalize()         // U2 = X2*Z1Z1 (mag: 1)
+	s2.Set(y2).Mul(&z1z1).Mul(z1).Normalize() // S2 = Y2*Z1*Z1Z1 (mag: 1)
+	if x1.Equals(&u2) {
+		if y1.Equals(&s2) {
+			// Since x1 == u2 and y1 == s2 (like terms), point
+			// doubling must be done, otherwise the addition would
+			// end up dividing by zero.
+			curve.doubleJacobian(x1, y1, z1, x3, y3, z3)
+			return
+		}
+
+		// Since x1 == u2 and y1 == -s2 (like terms), the sum is the
+		// point at infinity per the group law.
+		x3.SetInt(0)
+		y3.SetInt(0)
+		z3.SetInt(0)
+		return
+	}
+
+	// Calculate X3, Y3, and Z3 according to the intermediate elements
+	// breakdown above.
+	var h, hh, i, j, r, rr, v fieldVal
+	var negX1, negY1, negX3 fieldVal
+	negX1.Set(x1).Negate(1)                // negX1 = -X1 (mag: 2)
+	h.Add2(&u2, &negX1)                    // H = U2-X1 (mag: 3)
+	hh.SquareVal(&h)                       // HH = H^2 (mag: 1)
+	i.Set(&hh).MulInt(4)                   // I = 4 * HH (mag: 4)
+	j.Mul2(&h, &i)                         // J = H*I (mag: 1)
+	negY1.Set(y1).Negate(1)                // negY1 = -Y1 (mag: 2)
+	r.Set(&s2).Add(&negY1).MulInt(2)       // r = 2*(S2-Y1) (mag: 6)
+	rr.SquareVal(&r)                       // rr = r^2 (mag: 1)
+	v.Mul2(x1, &i)                         // V = X1*I (mag: 1)
+	x3.Set(&v).MulInt(2).Add(&j).Negate(3) // X3 = -(J+2*V) (mag: 4)
+	x3.Add(&rr)                            // X3 = r^2+X3 (mag: 5)
+	negX3.Set(x3).Negate(5)                // negX3 = -X3 (mag: 6)
+	y3.Set(y1).Mul(&j).MulInt(2).Negate(2) // Y3 = -(2*Y1*J) (mag: 3)
+	y3.Add(v.Add(&negX3).Mul(&r))          // Y3 = r*(V-X3)+Y3 (mag: 4)
+	z3.Add2(z1, &h).Square()               // Z3 = (Z1+H)^2 (mag: 1)
+	z3.Add(z1z1.Add(&hh).Negate(2))        // Z3 = Z3-(Z1Z1+HH) (mag: 4)
+
+	// Normalize the resulting field values to a magnitude of 1 as needed.
+	x3.Normalize()
+	y3.Normalize()
+	z3.Normalize()
+}
+
+// addGeneric adds two Jacobian points (x1, y1, z1) and (x2, y2, z2) without any
+// assumptions about the z values of the two points and stores the result in
+// (x3, y3, z3).  That is to say (x1, y1, z1) + (x2, y2, z2) = (x3, y3, z3).  It
+// is the slowest of the add routines due to requiring the most arithmetic.
+func (curve *KoblitzCurve) addGeneric(x1, y1, z1, x2, y2, z2, x3, y3, z3 *fieldVal) {
+	// To compute the point addition efficiently, this implementation splits
+	// the equation into intermediate elements which are used to minimize
+	// the number of field multiplications using the method shown at:
+	// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
+	//
+	// In particular it performs the calculations using the following:
+	// Z1Z1 = Z1^2, Z2Z2 = Z2^2, U1 = X1*Z2Z2, U2 = X2*Z1Z1, S1 = Y1*Z2*Z2Z2
+	// S2 = Y2*Z1*Z1Z1, H = U2-U1, I = (2*H)^2, J = H*I, r = 2*(S2-S1)
+	// V = U1*I
+	// X3 = r^2-J-2*V, Y3 = r*(V-X3)-2*S1*J, Z3 = ((Z1+Z2)^2-Z1Z1-Z2Z2)*H
+	//
+	// This results in a cost of 11 field multiplications, 5 field squarings,
+	// 9 field additions, and 4 integer multiplications.
+
+	// When the x coordinates are the same for two points on the curve, the
+	// y coordinates either must be the same, in which case it is point
+	// doubling, or they are opposite and the result is the point at
+	// infinity.  Since any number of Jacobian coordinates can represent the
+	// same affine point, the x and y values need to be converted to like
+	// terms.
+	var z1z1, z2z2, u1, u2, s1, s2 fieldVal
+	z1z1.SquareVal(z1)                        // Z1Z1 = Z1^2 (mag: 1)
+	z2z2.SquareVal(z2)                        // Z2Z2 = Z2^2 (mag: 1)
+	u1.Set(x1).Mul(&z2z2).Normalize()         // U1 = X1*Z2Z2 (mag: 1)
+	u2.Set(x2).Mul(&z1z1).Normalize()         // U2 = X2*Z1Z1 (mag: 1)
+	s1.Set(y1).Mul(&z2z2).Mul(z2).Normalize() // S1 = Y1*Z2*Z2Z2 (mag: 1)
+	s2.Set(y2).Mul(&z1z1).Mul(z1).Normalize() // S2 = Y2*Z1*Z1Z1 (mag: 1)
+	if u1.Equals(&u2) {
+		if s1.Equals(&s2) {
+			// Since u1 == u2 and s1 == s2 (like terms), point
+			// doubling must be done, otherwise the addition would
+			// end up dividing by zero.
+			curve.doubleJacobian(x1, y1, z1, x3, y3, z3)
+			return
+		}
+
+		// Since u1 == u2 and s1 == -s2 (like terms), the sum is the
+		// point at infinity per the group law.
+		x3.SetInt(0)
+		y3.SetInt(0)
+		z3.SetInt(0)
+		return
+	}
+
+	// Calculate X3, Y3, and Z3 according to the intermediate elements
+	// breakdown above.
+	var h, i, j, r, rr, v fieldVal
+	var negU1, negS1, negX3 fieldVal
+	negU1.Set(&u1).Negate(1)               // negU1 = -U1 (mag: 2)
+	h.Add2(&u2, &negU1)                    // H = U2-U1 (mag: 3)
+	i.Set(&h).MulInt(2).Square()           // I = (2*H)^2 (mag: 2)
+	j.Mul2(&h, &i)                         // J = H*I (mag: 1)
+	negS1.Set(&s1).Negate(1)               // negS1 = -S1 (mag: 2)
+	r.Set(&s2).Add(&negS1).MulInt(2)       // r = 2*(S2-S1) (mag: 6)
+	rr.SquareVal(&r)                       // rr = r^2 (mag: 1)
+	v.Mul2(&u1, &i)                        // V = U1*I (mag: 1)
+	x3.Set(&v).MulInt(2).Add(&j).Negate(3) // X3 = -(J+2*V) (mag: 4)
+	x3.Add(&rr)                            // X3 = r^2+X3 (mag: 5)
+	negX3.Set(x3).Negate(5)                // negX3 = -X3 (mag: 6)
+	y3.Mul2(&s1, &j).MulInt(2).Negate(2)   // Y3 = -(2*S1*J) (mag: 3)
+	y3.Add(v.Add(&negX3).Mul(&r))          // Y3 = r*(V-X3)+Y3 (mag: 4)
+	z3.Add2(z1, z2).Square()               // Z3 = (Z1+Z2)^2 (mag: 1)
+	z3.Add(z1z1.Add(&z2z2).Negate(2))      // Z3 = Z3-(Z1Z1+Z2Z2) (mag: 4)
+	z3.Mul(&h)                             // Z3 = Z3*H (mag: 1)
+
+	// Normalize x3 and y3 to a magnitude of 1.  z3 is not normalized here.
+	x3.Normalize()
+	y3.Normalize()
+}
+
+// addJacobian adds the Jacobian points (x1, y1, z1) and (x2, y2, z2) and stores
+// the result in (x3, y3, z3).  Note that z1 and z2 may be normalized in place.
+func (curve *KoblitzCurve) addJacobian(x1, y1, z1, x2, y2, z2, x3, y3, z3 *fieldVal) {
+	// A point at infinity is the identity according to the group law for
+	// elliptic curve cryptography.  Thus, ∞ + P = P and P + ∞ = P.
+	if (x1.IsZero() && y1.IsZero()) || z1.IsZero() {
 		x3.Set(x2)
 		y3.Set(y2)
 		z3.Set(z2)
-		return x3, y3, z3
+		return
 	}
-	if z2.Sign() == 0 {
+	if (x2.IsZero() && y2.IsZero()) || z2.IsZero() {
 		x3.Set(x1)
 		y3.Set(y1)
 		z3.Set(z1)
-		return x3, y3, z3
+		return
 	}
 
-	z1z1 := new(big.Int).Mul(z1, z1)
-	z1z1.Mod(z1z1, curve.P)
-	z2z2 := new(big.Int).Mul(z2, z2)
-	z2z2.Mod(z2z2, curve.P)
-
-	u1 := new(big.Int).Mul(x1, z2z2)
-	u1.Mod(u1, curve.P)
-	u2 := new(big.Int).Mul(x2, z1z1)
-	u2.Mod(u2, curve.P)
-	h := new(big.Int).Sub(u2, u1)
-	xEqual := h.Sign() == 0
-	if h.Sign() == -1 {
-		h.Add(h, curve.P)
+	// Faster point addition can be achieved when certain assumptions are
+	// met.  For example, when both points have the same z value, arithmetic
+	// on the z values can be avoided.  This section thus checks for these
+	// conditions and calls an appropriate add function which is accelerated
+	// by using those assumptions.
+	z1.Normalize()
+	z2.Normalize()
+	isZ1One := z1.Equals(fieldOne)
+	isZ2One := z2.Equals(fieldOne)
+	switch {
+	case isZ1One && isZ2One:
+		curve.addZ1AndZ2EqualsOne(x1, y1, x2, y2, x3, y3, z3)
+		return
+	case z1.Equals(z2):
+		curve.addZ1EqualsZ2(x1, y1, z1, x2, y2, x3, y3, z3)
+		return
+	case isZ2One:
+		curve.addZ2EqualsOne(x1, y1, z1, x2, y2, x3, y3, z3)
+		return
 	}
-	i := new(big.Int).Lsh(h, 1)
-	i.Mul(i, i)
-	j := new(big.Int).Mul(h, i)
 
-	s1 := new(big.Int).Mul(y1, z2)
-	s1.Mul(s1, z2z2)
-	s1.Mod(s1, curve.P)
-	s2 := new(big.Int).Mul(y2, z1)
-	s2.Mul(s2, z1z1)
-	s2.Mod(s2, curve.P)
-	r := new(big.Int).Sub(s2, s1)
-	if r.Sign() == -1 {
-		r.Add(r, curve.P)
+	// None of the above assumptions are true, so fall back to generic
+	// point addition.
+	curve.addGeneric(x1, y1, z1, x2, y2, z2, x3, y3, z3)
+}
+
+// Add returns the sum of (x1,y1) and (x2,y2). Part of the elliptic.Curve
+// interface.
+func (curve *KoblitzCurve) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
+	// A point at infinity is the identity according to the group law for
+	// elliptic curve cryptography.  Thus, ∞ + P = P and P + ∞ = P.
+	if x1.Sign() == 0 && y1.Sign() == 0 {
+		return x2, y2
 	}
-	yEqual := r.Sign() == 0
-	if xEqual && yEqual {
-		return curve.doubleJacobian(x1, y1, z1)
+	if x2.Sign() == 0 && y2.Sign() == 0 {
+		return x1, y1
 	}
-	r.Lsh(r, 1)
-	v := new(big.Int).Mul(u1, i)
 
-	x3.Set(r)
-	x3.Mul(x3, x3)
-	x3.Sub(x3, j)
-	x3.Sub(x3, v)
-	x3.Sub(x3, v)
-	x3.Mod(x3, curve.P)
+	// Add in Jacobian space with a private one for z (it is normalized in place).
+	fx1, fy1 := curve.bigAffineToField(x1, y1)
+	fx2, fy2 := curve.bigAffineToField(x2, y2)
+	fx3, fy3, fz3 := new(fieldVal), new(fieldVal), new(fieldVal)
+	fOne := new(fieldVal).SetInt(1)
+	curve.addJacobian(fx1, fy1, fOne, fx2, fy2, fOne, fx3, fy3, fz3)
 
-	y3.Set(r)
-	v.Sub(v, x3)
-	y3.Mul(y3, v)
-	s1.Mul(s1, j)
-	s1.Lsh(s1, 1)
-	y3.Sub(y3, s1)
-	y3.Mod(y3, curve.P)
+	// Convert the Jacobian coordinate field values back to affine big
+	// integers.
+	return curve.fieldJacobianToBigAffine(fx3, fy3, fz3)
+}
 
-	z3.Add(z1, z2)
-	z3.Mul(z3, z3)
-	z3.Sub(z3, z1z1)
-	z3.Sub(z3, z2z2)
-	z3.Mul(z3, h)
-	z3.Mod(z3, curve.P)
+// doubleZ1EqualsOne performs point doubling on the passed Jacobian point
+// when the point is already known to have a z value of 1 and stores
+// the result in (x3, y3, z3).  That is to say (x3, y3, z3) = 2*(x1, y1, 1).  It
+// performs faster point doubling than the generic routine since less arithmetic
+// is needed due to the ability to avoid multiplication by the z value.
+func (curve *KoblitzCurve) doubleZ1EqualsOne(x1, y1, x3, y3, z3 *fieldVal) {
+	// This function uses the assumptions that z1 is 1, thus the point
+	// doubling formulas reduce to:
+	//
+	// X3 = (3*X1^2)^2 - 8*X1*Y1^2
+	// Y3 = (3*X1^2)*(4*X1*Y1^2 - X3) - 8*Y1^4
+	// Z3 = 2*Y1
+	//
+	// To compute the above efficiently, this implementation splits the
+	// equation into intermediate elements which are used to minimize the
+	// number of field multiplications in favor of field squarings which
+	// are roughly 35% faster than field multiplications with the current
+	// implementation at the time this was written.
+	//
+	// This uses a slightly modified version of the method shown at:
+	// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-mdbl-2007-bl
+	//
+	// In particular it performs the calculations using the following:
+	// A = X1^2, B = Y1^2, C = B^2, D = 2*((X1+B)^2-A-C)
+	// E = 3*A, F = E^2, X3 = F-2*D, Y3 = E*(D-X3)-8*C
+	// Z3 = 2*Y1
+	//
+	// This results in a cost of 1 field multiplication, 5 field squarings,
+	// 6 field additions, and 5 integer multiplications.
+	var a, b, c, d, e, f fieldVal
+	z3.Set(y1).MulInt(2)                     // Z3 = 2*Y1 (mag: 2)
+	a.SquareVal(x1)                          // A = X1^2 (mag: 1)
+	b.SquareVal(y1)                          // B = Y1^2 (mag: 1)
+	c.SquareVal(&b)                          // C = B^2 (mag: 1)
+	b.Add(x1).Square()                       // B = (X1+B)^2 (mag: 1)
+	d.Set(&a).Add(&c).Negate(2)              // D = -(A+C) (mag: 3)
+	d.Add(&b).MulInt(2)                      // D = 2*(B+D) (mag: 8)
+	e.Set(&a).MulInt(3)                      // E = 3*A (mag: 3)
+	f.SquareVal(&e)                          // F = E^2 (mag: 1)
+	x3.Set(&d).MulInt(2).Negate(16)          // X3 = -(2*D) (mag: 17)
+	x3.Add(&f)                               // X3 = F+X3 (mag: 18)
+	f.Set(x3).Negate(18).Add(&d).Normalize() // F = D-X3 (mag: 1)
+	y3.Set(&c).MulInt(8).Negate(8)           // Y3 = -(8*C) (mag: 9)
+	y3.Add(f.Mul(&e))                        // Y3 = E*F+Y3 (mag: 10)
 
-	return x3, y3, z3
+	// Normalize the field values back to a magnitude of 1.
+	x3.Normalize()
+	y3.Normalize()
+	z3.Normalize()
+}
+
+// doubleGeneric performs point doubling on the passed Jacobian point without
+// any assumptions about the z value and stores the result in (x3, y3, z3).
+// That is to say (x3, y3, z3) = 2*(x1, y1, z1).  It is the slowest of the point
+// doubling routines due to requiring the most arithmetic.
+func (curve *KoblitzCurve) doubleGeneric(x1, y1, z1, x3, y3, z3 *fieldVal) {
+	// Point doubling formula for Jacobian coordinates for the secp256k1
+	// curve:
+	// X3 = (3*X1^2)^2 - 8*X1*Y1^2
+	// Y3 = (3*X1^2)*(4*X1*Y1^2 - X3) - 8*Y1^4
+	// Z3 = 2*Y1*Z1
+	//
+	// To compute the above efficiently, this implementation splits the
+	// equation into intermediate elements which are used to minimize the
+	// number of field multiplications in favor of field squarings which
+	// are roughly 35% faster than field multiplications with the current
+	// implementation at the time this was written.
+	//
+	// This uses a slightly modified version of the method shown at:
+	// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
+	//
+	// In particular it performs the calculations using the following:
+	// A = X1^2, B = Y1^2, C = B^2, D = 2*((X1+B)^2-A-C)
+	// E = 3*A, F = E^2, X3 = F-2*D, Y3 = E*(D-X3)-8*C
+	// Z3 = 2*Y1*Z1
+	//
+	// This results in a cost of 1 field multiplication, 5 field squarings,
+	// 6 field additions, and 5 integer multiplications.
+	var a, b, c, d, e, f fieldVal
+	z3.Mul2(y1, z1).MulInt(2)                // Z3 = 2*Y1*Z1 (mag: 2)
+	a.SquareVal(x1)                          // A = X1^2 (mag: 1)
+	b.SquareVal(y1)                          // B = Y1^2 (mag: 1)
+	c.SquareVal(&b)                          // C = B^2 (mag: 1)
+	b.Add(x1).Square()                       // B = (X1+B)^2 (mag: 1)
+	d.Set(&a).Add(&c).Negate(2)              // D = -(A+C) (mag: 3)
+	d.Add(&b).MulInt(2)                      // D = 2*(B+D) (mag: 8)
+	e.Set(&a).MulInt(3)                      // E = 3*A (mag: 3)
+	f.SquareVal(&e)                          // F = E^2 (mag: 1)
+	x3.Set(&d).MulInt(2).Negate(16)          // X3 = -(2*D) (mag: 17)
+	x3.Add(&f)                               // X3 = F+X3 (mag: 18)
+	f.Set(x3).Negate(18).Add(&d).Normalize() // F = D-X3 (mag: 1)
+	y3.Set(&c).MulInt(8).Negate(8)           // Y3 = -(8*C) (mag: 9)
+	y3.Add(f.Mul(&e))                        // Y3 = E*F+Y3 (mag: 10)
+
+	// Normalize the field values back to a magnitude of 1.
+	x3.Normalize()
+	y3.Normalize()
+	z3.Normalize()
+}
+
+// doubleJacobian doubles the Jacobian point (x1, y1, z1) and stores the result
+// in (x3, y3, z3).  Note that z1 may be normalized in place.
+func (curve *KoblitzCurve) doubleJacobian(x1, y1, z1, x3, y3, z3 *fieldVal) {
+	// Doubling infinity (z1 == 0) or a point with y1 == 0 results in infinity.
+	if y1.IsZero() || z1.IsZero() {
+		x3.SetInt(0)
+		y3.SetInt(0)
+		z3.SetInt(0)
+		return
+	}
+
+	// Slightly faster point doubling can be achieved when the z value is 1
+	// by avoiding the multiplication on the z value.  This section calls
+	// a point doubling function which is accelerated by using that
+	// assumption when possible.
+	if z1.Normalize().Equals(fieldOne) {
+		curve.doubleZ1EqualsOne(x1, y1, x3, y3, z3)
+		return
+	}
+
+	// Fall back to generic point doubling which works with arbitrary z
+	// values.
+	curve.doubleGeneric(x1, y1, z1, x3, y3, z3)
 }
 
 // Double returns 2*(x1,y1). Part of the elliptic.Curve interface.
 func (curve *KoblitzCurve) Double(x1, y1 *big.Int) (*big.Int, *big.Int) {
-	z1 := zForAffine(x1, y1)
-	return curve.affineFromJacobian(curve.doubleJacobian(x1, y1, z1))
-}
+	if y1.Sign() == 0 {
+		return new(big.Int), new(big.Int)
+	}
 
-// doubleJacobian takes a point in Jacobian coordinates, (x, y, z), and
-// returns its double, also in Jacobian form.
-func (curve *KoblitzCurve) doubleJacobian(x, y, z *big.Int) (*big.Int, *big.Int, *big.Int) {
-	// See http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
+	// Double in Jacobian space with a private one for z (normalized in place).
+	fx1, fy1 := curve.bigAffineToField(x1, y1)
+	fx3, fy3, fz3 := new(fieldVal), new(fieldVal), new(fieldVal)
+	fOne := new(fieldVal).SetInt(1)
+	curve.doubleJacobian(fx1, fy1, fOne, fx3, fy3, fz3)
 
-	a := new(big.Int).Mul(x, x) //X1²
-	b := new(big.Int).Mul(y, y) //Y1²
-	c := new(big.Int).Mul(b, b) //B²
-
-	d := new(big.Int).Add(x, b) //X1+B
-	d.Mul(d, d)                 //(X1+B)²
-	d.Sub(d, a)                 //(X1+B)²-A
-	d.Sub(d, c)                 //(X1+B)²-A-C
-	d.Mul(d, big.NewInt(2))     //2*((X1+B)²-A-C)
-
-	e := new(big.Int).Mul(big.NewInt(3), a) //3*A
-	f := new(big.Int).Mul(e, e)             //E²
-
-	x3 := new(big.Int).Mul(big.NewInt(2), d) //2*D
-	x3.Sub(f, x3)                            //F-2*D
-	x3.Mod(x3, curve.P)
-
-	y3 := new(big.Int).Sub(d, x3)                  //D-X3
-	y3.Mul(e, y3)                                  //E*(D-X3)
-	y3.Sub(y3, new(big.Int).Mul(big.NewInt(8), c)) //E*(D-X3)-8*C
-	y3.Mod(y3, curve.P)
-
-	z3 := new(big.Int).Mul(y, z) //Y1*Z1
-	z3.Mul(big.NewInt(2), z3)    //3*Y1*Z1
-	z3.Mod(z3, curve.P)
-
-	return x3, y3, z3
+	// Convert the Jacobian coordinate field values back to affine big
+	// integers.
+	return curve.fieldJacobianToBigAffine(fx3, fy3, fz3)
 }
 
 // ScalarMult returns k*(Bx, By) where k is a big endian integer.
 // Part of the elliptic.Curve interface.
 func (curve *KoblitzCurve) ScalarMult(Bx, By *big.Int, k []byte) (*big.Int, *big.Int) {
-	Bz := new(big.Int).SetInt64(1)
-	x, y, z := new(big.Int), new(big.Int), new(big.Int)
+	// This uses the left to right binary method for point multiplication:
 
-	for _, byte := range k {
+	// Point Q = ∞ (point at infinity).
+	qx, qy, qz := new(fieldVal), new(fieldVal), new(fieldVal)
+
+	// Point P = the point to multiply; pz is a private one (normalized in place).
+	px, py := curve.bigAffineToField(Bx, By)
+	pz := new(fieldVal).SetInt(1)
+
+	// Double and add as necessary depending on the bits set in the scalar.
+	for _, byteVal := range k {
 		for bitNum := 0; bitNum < 8; bitNum++ {
-			x, y, z = curve.doubleJacobian(x, y, z)
-			if byte&0x80 == 0x80 {
-				x, y, z = curve.addJacobian(Bx, By, Bz, x, y, z)
+			// Q = 2*Q
+			curve.doubleJacobian(qx, qy, qz, qx, qy, qz)
+			if byteVal&0x80 == 0x80 {
+				// Q = Q + P
+				curve.addJacobian(qx, qy, qz, px, py, pz, qx,
+					qy, qz)
 			}
-			byte <<= 1
+			byteVal <<= 1
 		}
 	}
 
-	return curve.affineFromJacobian(x, y, z)
+	// Convert the Jacobian coordinate field values back to affine big.Ints.
+	return curve.fieldJacobianToBigAffine(qx, qy, qz)
 }
 
 // ScalarBaseMult returns k*G where G is the base point of the group and k is a
@@ -247,7 +642,7 @@ func initAll() {
 }
 
 func initS256() {
-	// See SEC 2 section 2.7.1
+	// See [SECG] section 2.7.1
 	secp256k1.CurveParams = new(elliptic.CurveParams)
 	secp256k1.P, _ = new(big.Int).SetString("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F", 16)
 	secp256k1.N, _ = new(big.Int).SetString("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141", 16)
diff --git a/field.go b/field.go
new file mode 100644
index 00000000..ac123e9e
--- /dev/null
+++ b/field.go
@@ -0,0 +1,1270 @@
+// Copyright (c) 2013 Conformal Systems LLC.
+// Copyright (c) 2013 Dave Collins
+// Use of this source code is governed by an ISC
+// license that can be found in the LICENSE file.
+
+package btcec
+
+// References:
+//   [HAC]: Handbook of Applied Cryptography Menezes, van Oorschot, Vanstone.
+//     http://cacr.uwaterloo.ca/hac/
+
+// All elliptic curve operations for secp256k1 are done in a finite field
+// characterized by a 256-bit prime.  Given this precision is larger than the
+// biggest available native type, obviously some form of bignum math is needed.
+// This package implements specialized fixed-precision field arithmetic rather
+// than relying on an arbitrary-precision arithmetic package such as math/big
+// for dealing with the field math since the size is known.  As a result, rather
+// large performance gains are achieved by taking advantage of many
+// optimizations not available to arbitrary-precision arithmetic and generic
+// modular arithmetic algorithms.
+//
+// There are various ways to internally represent each finite field element.
+// For example, the most obvious representation would be to use an array of 4
+// uint64s (64 bits * 4 = 256 bits).  However, that representation suffers from
+// a couple of issues.  First, there is no native Go type large enough to handle
+// the intermediate results while adding or multiplying two 64-bit numbers, and
+// second there is no space left for overflows when performing the intermediate
+// arithmetic between each array element which would lead to expensive carry
+// propagation.
+//
+// Given the above, this implementation represents the field elements as
+// 10 uint32s with each word (array entry) treated as base 2^26.  This was
+// chosen for the following reasons:
+// 1) Most systems at the current time are 64-bit (or at least have 64-bit
+//    registers available for specialized purposes such as MMX) so the
+//    intermediate results can typically be done using a native register (and
+//    using uint64s to avoid the need for additional half-word arithmetic)
+// 2) In order to allow addition of the internal words without having to
+//    propagate the carry, the max normalized value for each word must use
+//    fewer bits than are available in the register
+// 3) Since we're dealing with 32-bit values, 64-bits of overflow is a
+//    reasonable choice for #2
+// 4) Given the need for 256-bits of precision and the properties stated in #1,
+//    #2, and #3, the representation which best accommodates this is 10 uint32s
+//    with base 2^26 (26 bits * 10 = 260 bits, so the final word only needs 22
+//    bits) which leaves the desired 64 bits (32 * 10 = 320, 320 - 256 = 64) for
+//    overflow
+//
+// Since it is so important that the field arithmetic is extremely fast for
+// high performance crypto, this package does not perform any validation where
+// it ordinarily would.  For example, some functions only give the correct
+// result if the field is normalized and there is no checking to ensure it is.
+// While I typically prefer to ensure all state and input is valid for most
+// packages, this code is really only used internally and every extra check
+// counts.
+
+import (
+	"encoding/hex"
+)
+
+// Masks for the low 2, 4, 6, and 8 bits, named to make the code more readable.
+const (
+	twoBitsMask   = 0x3
+	fourBitsMask  = 0xf
+	sixBitsMask   = 0x3f
+	eightBitsMask = 0xff
+)
+
+// Constants related to the field representation.
+const (
+	// fieldWords is the number of words used to internally represent the
+	// 256-bit value.
+	fieldWords = 10
+
+	// fieldBase is the exponent used to form the numeric base of each word.
+	// Word i is weighted by 2^(fieldBase*i) where i is the word position.
+	fieldBase = 26
+
+	// fieldOverflowBits is the minimum number of "overflow" bits for each
+	// word in the field value (32 - 26 = 6).
+	fieldOverflowBits = 32 - fieldBase
+
+	// fieldBaseMask is the mask for the bits in each word needed to
+	// represent the numeric base of each word (except the most significant
+	// word).
+	fieldBaseMask = (1 << fieldBase) - 1
+
+	// fieldMSBBits is the number of bits in the most significant word used
+	// to represent the value (256 - 26*9 = 22).
+	fieldMSBBits = 256 - (fieldBase * (fieldWords - 1))
+
+	// fieldMSBMask is the mask for the bits in the most significant word
+	// needed to represent the value.
+	fieldMSBMask = (1 << fieldMSBBits) - 1
+
+	// fieldPrimeWordZero is word zero of the secp256k1 prime in the
+	// internal field representation.  It is used during modular reduction
+	// and negation.
+	fieldPrimeWordZero = 0x3fffc2f
+
+	// fieldPrimeWordOne is word one of the secp256k1 prime in the
+	// internal field representation.  It is used during modular reduction
+	// and negation.
+	fieldPrimeWordOne = 0x3ffffbf
+)
+
+// fieldVal implements optimized fixed-precision arithmetic over the
+// secp256k1 finite field.  This means all arithmetic is performed modulo
+// 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f.  It
+// represents each 256-bit value as 10 32-bit integers in base 2^26.  This
+// provides 6 bits of overflow in each word (10 bits in the most significant
+// word) for a total of 64 bits of overflow (9*6 + 10 = 64).  It only implements
+// the arithmetic needed for elliptic curve operations.
+//
+// The following depicts the internal representation (n[0] is the low word):
+// 	 -----------------------------------------------------------------
+// 	|        n[9]       |        n[8]       | ... |        n[0]       |
+// 	| 32 bits available | 32 bits available | ... | 32 bits available |
+// 	| 22 bits for value | 26 bits for value | ... | 26 bits for value |
+// 	| 10 bits overflow  |  6 bits overflow  | ... |  6 bits overflow  |
+// 	| Mult: 2^(26*9)    | Mult: 2^(26*8)    | ... | Mult: 2^(26*0)    |
+// 	 -----------------------------------------------------------------
+//
+// For example, consider the number 2^49 + 1.  It would be represented as:
+// 	n[0] = 1
+// 	n[1] = 2^23
+// 	n[2..9] = 0
+//
+// The full 256-bit value is then calculated by looping i from 9..0 and
+// doing sum(n[i] * 2^(26i)) like so:
+// 	n[9] * 2^(26*9) = 0    * 2^234 = 0
+// 	n[8] * 2^(26*8) = 0    * 2^208 = 0
+// 	...
+// 	n[1] * 2^(26*1) = 2^23 * 2^26  = 2^49
+// 	n[0] * 2^(26*0) = 1    * 2^0   = 1
+// 	Sum: 0 + 0 + ... + 2^49 + 1 = 2^49 + 1
+type fieldVal struct {
+	n [10]uint32
+}
+
+// String returns the normalized field value as a big-endian hex string.
+func (f fieldVal) String() string {
+	normalized := new(fieldVal).Set(&f).Normalize().Bytes()
+	return hex.EncodeToString(normalized[:])
+}
+
+// Zero sets the field value to zero by clearing every word.  A newly created
+// field value is already zero, so this is only useful to clear an existing
+// field value for reuse.
+func (f *fieldVal) Zero() {
+	f.n[0] = 0
+	f.n[1] = 0
+	f.n[2] = 0
+	f.n[3] = 0
+	f.n[4] = 0
+	f.n[5] = 0
+	f.n[6] = 0
+	f.n[7] = 0
+	f.n[8] = 0
+	f.n[9] = 0
+}
+
+// Set copies the words of the passed value into the field value.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f := new(fieldVal).Set(f2).AddInt(1) so that f = f2 + 1 where f2 is not
+// modified.
+func (f *fieldVal) Set(val *fieldVal) *fieldVal {
+	f.n = val.n
+	return f
+}
+
+// SetInt sets the field value to the passed integer.  This is a convenience
+// function since it is fairly common to perform some arithmetic with small
+// native integers.
+//
+// The field value is returned to support chaining.  This enables syntax such
+// as f := new(fieldVal).SetInt(2).Mul(f2) so that f = 2 * f2.
+func (f *fieldVal) SetInt(ui uint) *fieldVal {
+	// Only the least significant word is populated; the rest are zero.
+	f.n = [10]uint32{uint32(ui)}
+	return f
+}
+
+// SetBytes packs the passed 32-byte big-endian value into the internal field
+// value representation.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f := new(fieldVal).SetBytes(byteArray).Mul(f2) so that f = ba * f2.
+func (f *fieldVal) SetBytes(b *[32]byte) *fieldVal {
+	// Pack the 256 total bits across the 10 uint32 words using 26 bits per
+	// word (22 in the most significant).  This could be done with a couple
+	// of for loops, but this unrolled version is significantly faster:
+	// benchmarks show about 34 times faster than the variant with loops.
+	f.n[0] = uint32(b[31]) | uint32(b[30])<<8 | uint32(b[29])<<16 |
+		(uint32(b[28])&twoBitsMask)<<24
+	f.n[1] = uint32(b[28])>>2 | uint32(b[27])<<6 | uint32(b[26])<<14 |
+		(uint32(b[25])&fourBitsMask)<<22
+	f.n[2] = uint32(b[25])>>4 | uint32(b[24])<<4 | uint32(b[23])<<12 |
+		(uint32(b[22])&sixBitsMask)<<20
+	f.n[3] = uint32(b[22])>>6 | uint32(b[21])<<2 | uint32(b[20])<<10 |
+		uint32(b[19])<<18
+	f.n[4] = uint32(b[18]) | uint32(b[17])<<8 | uint32(b[16])<<16 |
+		(uint32(b[15])&twoBitsMask)<<24
+	f.n[5] = uint32(b[15])>>2 | uint32(b[14])<<6 | uint32(b[13])<<14 |
+		(uint32(b[12])&fourBitsMask)<<22
+	f.n[6] = uint32(b[12])>>4 | uint32(b[11])<<4 | uint32(b[10])<<12 |
+		(uint32(b[9])&sixBitsMask)<<20
+	f.n[7] = uint32(b[9])>>6 | uint32(b[8])<<2 | uint32(b[7])<<10 |
+		uint32(b[6])<<18
+	f.n[8] = uint32(b[5]) | uint32(b[4])<<8 | uint32(b[3])<<16 |
+		(uint32(b[2])&twoBitsMask)<<24
+	f.n[9] = uint32(b[2])>>2 | uint32(b[1])<<6 | uint32(b[0])<<14
+	return f
+}
+
+// SetByteSlice packs the passed big-endian value into the internal field value
+// representation.  Only the first 32-bytes are used.  As a result, it is up to
+// the caller to ensure numbers of the appropriate size are used or the value
+// will be truncated.  Shorter values are left-padded with zeros.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f := new(fieldVal).SetByteSlice(byteSlice)
+func (f *fieldVal) SetByteSlice(b []byte) *fieldVal {
+	// Truncate first so the right-aligned copy can't index out of range.
+	if len(b) > 32 {
+		b = b[:32]
+	}
+	var b32 [32]byte
+	copy(b32[32-len(b):], b)
+	return f.SetBytes(&b32)
+}
+
+// SetHex decodes the passed big-endian hex string (odd lengths are padded with
+// a leading zero) and hands the bytes to SetByteSlice, so its size limits
+// apply.  Any hex decoding error is silently ignored.
+//
+// The field value is returned to support chaining: f.SetHex("0abc").AddInt(1).
+func (f *fieldVal) SetHex(hexString string) *fieldVal {
+	if len(hexString)&1 == 1 {
+		hexString = "0" + hexString
+	}
+	decoded, _ := hex.DecodeString(hexString)
+	return f.SetByteSlice(decoded)
+}
+
+// Normalize normalizes the internal field words into the desired range and
+// performs fast modular reduction over the secp256k1 prime by making use of the
+// special form of the prime.
+func (f *fieldVal) Normalize() *fieldVal {
+	// The field representation leaves 6 bits of overflow in each
+	// word so intermediate calculations can be performed without needing
+	// to propagate the carry to each higher word during the calculations.
+	// In order to normalize, first we need to "compact" the full 256-bit
+	// value to the right and treat the additional 64 leftmost bits as
+	// the magnitude.
+	m := f.n[0]
+	t0 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[1]
+	t1 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[2]
+	t2 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[3]
+	t3 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[4]
+	t4 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[5]
+	t5 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[6]
+	t6 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[7]
+	t7 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[8]
+	t8 := m & fieldBaseMask
+	m = (m >> fieldBase) + f.n[9]
+	t9 := m & fieldMSBMask
+	m = m >> fieldMSBBits
+
+	// At this point, if the magnitude is greater than 0, the overall value
+	// is greater than the max possible 256-bit value.  In particular, it is
+	// "how many times larger" than the max value it is.  Since this field
+	// is doing arithmetic modulo the secp256k1 prime, we need to perform
+	// modular reduction over the prime.
+	//
+	// Per [HAC] section 14.3.4: Reduction method of moduli of special form,
+	// when the modulus is of the special form m = b^t - c, highly efficient
+	// reduction can be achieved.
+	//
+	// The secp256k1 prime is equivalent to 2^256 - 4294968273, so it fits
+	// this criteria.
+	//
+	// 4294968273 in field representation (base 2^26) is:
+	// n[0] = 977
+	// n[1] = 64
+	// That is to say (2^26 * 64) + 977 = 4294968273
+	//
+	// The algorithm presented in the referenced section typically repeats
+	// until the quotient is zero.  However, due to our field representation
+	// we already know at least how many times we would need to repeat as
+	// it's the value currently in m.  Thus we can simply multiply the
+	// magnitude by the field representation of c and do a single
+	// iteration.  Notice that nothing will be changed when the magnitude is
+	// zero, so we could skip this in that case, however always running
+	// regardless allows it to run in constant time.
+	r := t0 + m*977
+	t0 = r & fieldBaseMask
+	r = (r >> fieldBase) + t1 + m*64
+	t1 = r & fieldBaseMask
+	r = (r >> fieldBase) + t2
+	t2 = r & fieldBaseMask
+	r = (r >> fieldBase) + t3
+	t3 = r & fieldBaseMask
+	r = (r >> fieldBase) + t4
+	t4 = r & fieldBaseMask
+	r = (r >> fieldBase) + t5
+	t5 = r & fieldBaseMask
+	r = (r >> fieldBase) + t6
+	t6 = r & fieldBaseMask
+	r = (r >> fieldBase) + t7
+	t7 = r & fieldBaseMask
+	r = (r >> fieldBase) + t8
+	t8 = r & fieldBaseMask
+	r = (r >> fieldBase) + t9
+	t9 = r & fieldMSBMask
+	m = r >> fieldMSBBits
+
+	// At this point, the words hold a value below 2^256 and m is either 0
+	// or 1 depending on whether the second pass carried past bit 256 (the
+	// first pass can leave the value within m*c of 2^256).  One more
+	// subtraction of the prime is needed when either:
+	// 1) the second pass carried out, so the true value is 2^256 + words
+	// 2) the words themselves are greater than or equal to the prime
+	//
+	// Subtracting the prime is the same as adding c = 2^256 - prime and
+	// discarding bit 256, so both cases are handled below by adding m*c
+	// and masking the most significant word.
+	//
+	// The if/else pairs intentionally perform a bitwise operation in both
+	// branches, even though one of them won't change the value, with the
+	// intent of keeping the timing the same between the branches.
+	//
+	// The words can only be >= the prime when the most significant word
+	// has all of its value bits set...
+	ge := uint32(1)
+	if t9 == fieldMSBMask {
+		ge &= 1
+	} else {
+		ge &= 0
+	}
+
+	// ...and words two through eight have all of their value bits set,
+	// since the prime has every bit set in those words...
+	if t2&t3&t4&t5&t6&t7&t8 == fieldBaseMask {
+		ge &= 1
+	} else {
+		ge &= 0
+	}
+
+	// ...and adding c (977 in word zero, 64 in word one) to the two least
+	// significant words carries out of word one.  Comparing words zero and
+	// one against the prime independently is not sufficient because word
+	// one alone can already put the value above the prime.
+	if ((t0+977)>>fieldBase)+t1+64 > fieldBaseMask {
+		ge &= 1
+	} else {
+		ge &= 0
+	}
+	m |= ge
+
+	// Add m*c and propagate the carries.  When m is zero this leaves every
+	// word untouched since each word is already within its value bits.
+	// Masking the most significant word at the end drops bit 256, which
+	// completes the subtraction of the prime.
+	t0 = t0 + m*977
+	t1 = (t0 >> fieldBase) + t1 + m*64
+	t0 = t0 & fieldBaseMask
+	t2 = (t1 >> fieldBase) + t2
+	t1 = t1 & fieldBaseMask
+	t3 = (t2 >> fieldBase) + t3
+	t2 = t2 & fieldBaseMask
+	t4 = (t3 >> fieldBase) + t4
+	t3 = t3 & fieldBaseMask
+	t5 = (t4 >> fieldBase) + t5
+	t4 = t4 & fieldBaseMask
+	t6 = (t5 >> fieldBase) + t6
+	t5 = t5 & fieldBaseMask
+	t7 = (t6 >> fieldBase) + t7
+	t6 = t6 & fieldBaseMask
+	t8 = (t7 >> fieldBase) + t8
+	t7 = t7 & fieldBaseMask
+	t9 = (t8 >> fieldBase) + t9
+	t8 = t8 & fieldBaseMask
+	t9 = t9 & fieldMSBMask
+
+	// Finally, set the normalized and reduced words.
+	f.n[0] = t0
+	f.n[1] = t1
+	f.n[2] = t2
+	f.n[3] = t3
+	f.n[4] = t4
+	f.n[5] = t5
+	f.n[6] = t6
+	f.n[7] = t7
+	f.n[8] = t8
+	f.n[9] = t9
+	return f
+}
+
+// PutBytes unpacks the field value to a 32-byte big-endian value using the
+// passed byte array.  There is a similar function, Bytes, which unpacks the
+// field value into a new array and returns that.  This version is provided
+// since it can be useful to cut down on the number of allocations by allowing
+// the caller to reuse a buffer.
+//
+// The field value must be normalized for this function to return the correct
+// result.
+func (f *fieldVal) PutBytes(b *[32]byte) {
+	// Unpack the 256 total bits from the 10 uint32 words with a max of
+	// 26-bits per word.  This could be done with a couple of for loops,
+	// but this unrolled version is much faster.  Benchmarks show this is
+	// about 10 times faster than the variant which uses loops.
+	b[31] = byte(f.n[0] & eightBitsMask)
+	b[30] = byte((f.n[0] >> 8) & eightBitsMask)
+	b[29] = byte((f.n[0] >> 16) & eightBitsMask)
+	b[28] = byte((f.n[0]>>24)&twoBitsMask | (f.n[1]&sixBitsMask)<<2)
+	b[27] = byte((f.n[1] >> 6) & eightBitsMask)
+	b[26] = byte((f.n[1] >> 14) & eightBitsMask)
+	b[25] = byte((f.n[1]>>22)&fourBitsMask | (f.n[2]&fourBitsMask)<<4)
+	b[24] = byte((f.n[2] >> 4) & eightBitsMask)
+	b[23] = byte((f.n[2] >> 12) & eightBitsMask)
+	b[22] = byte((f.n[2]>>20)&sixBitsMask | (f.n[3]&twoBitsMask)<<6)
+	b[21] = byte((f.n[3] >> 2) & eightBitsMask)
+	b[20] = byte((f.n[3] >> 10) & eightBitsMask)
+	b[19] = byte((f.n[3] >> 18) & eightBitsMask)
+	b[18] = byte(f.n[4] & eightBitsMask)
+	b[17] = byte((f.n[4] >> 8) & eightBitsMask)
+	b[16] = byte((f.n[4] >> 16) & eightBitsMask)
+	b[15] = byte((f.n[4]>>24)&twoBitsMask | (f.n[5]&sixBitsMask)<<2)
+	b[14] = byte((f.n[5] >> 6) & eightBitsMask)
+	b[13] = byte((f.n[5] >> 14) & eightBitsMask)
+	b[12] = byte((f.n[5]>>22)&fourBitsMask | (f.n[6]&fourBitsMask)<<4)
+	b[11] = byte((f.n[6] >> 4) & eightBitsMask)
+	b[10] = byte((f.n[6] >> 12) & eightBitsMask)
+	b[9] = byte((f.n[6]>>20)&sixBitsMask | (f.n[7]&twoBitsMask)<<6)
+	b[8] = byte((f.n[7] >> 2) & eightBitsMask)
+	b[7] = byte((f.n[7] >> 10) & eightBitsMask)
+	b[6] = byte((f.n[7] >> 18) & eightBitsMask)
+	b[5] = byte(f.n[8] & eightBitsMask)
+	b[4] = byte((f.n[8] >> 8) & eightBitsMask)
+	b[3] = byte((f.n[8] >> 16) & eightBitsMask)
+	b[2] = byte((f.n[8]>>24)&twoBitsMask | (f.n[9]&sixBitsMask)<<2)
+	b[1] = byte((f.n[9] >> 6) & eightBitsMask)
+	b[0] = byte((f.n[9] >> 14) & eightBitsMask)
+}
+
+// Bytes unpacks the field value to a 32-byte big-endian value.  See PutBytes
+// for a variant that allows a buffer to be passed which can be useful to cut
+// down on the number of allocations by allowing the caller to reuse a
+// buffer.
+//
+// The field value must be normalized for this function to return the correct
+// result.
+func (f *fieldVal) Bytes() *[32]byte {
+	var out [32]byte
+	f.PutBytes(&out)
+	return &out
+}
+
+// IsZero returns whether or not the field value is equal to zero.
+func (f *fieldVal) IsZero() bool {
+	// No bits set in any word means zero.  Only the raw words are examined,
+	// so normalize first.  This is a constant time implementation.
+	bits := f.n[0] | f.n[1] | f.n[2] | f.n[3] | f.n[4] |
+		f.n[5] | f.n[6] | f.n[7] | f.n[8] | f.n[9]
+
+	return bits == 0
+}
+
+// IsOdd reports whether the field value is an odd number.
+//
+// The field value must be normalized for this function to return the correct
+// result.
+func (f *fieldVal) IsOdd() bool {
+	// The lowest bit of the least significant word decides the parity.
+	return f.n[0]&1 != 0
+}
+
+// Equals reports whether the two field values hold identical words.  Both
+// field values being compared must be normalized for this function to return
+// the correct result.
+func (f *fieldVal) Equals(val *fieldVal) bool {
+	// XOR of two words is zero only when they match, so OR-ing together
+	// the XOR of every word pair yields zero exactly when all ten words
+	// are equal.  This is a constant time implementation.
+	diff := (f.n[0] ^ val.n[0]) | (f.n[1] ^ val.n[1]) |
+		(f.n[2] ^ val.n[2]) | (f.n[3] ^ val.n[3]) |
+		(f.n[4] ^ val.n[4]) | (f.n[5] ^ val.n[5]) |
+		(f.n[6] ^ val.n[6]) | (f.n[7] ^ val.n[7]) |
+		(f.n[8] ^ val.n[8]) | (f.n[9] ^ val.n[9])
+	return diff == 0
+}
+
+// NegateVal negates the passed value and stores the result in f.  The caller
+// must provide the magnitude of the passed value for a correct result.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f.NegateVal(f2, 1).AddInt(1) so that f = -f2 + 1.
+func (f *fieldVal) NegateVal(val *fieldVal, magnitude uint32) *fieldVal {
+	// Negation in the field is just the prime minus the value.  However,
+	// in order to allow negation against a field value without having to
+	// normalize/reduce it first, multiply by the magnitude (that is how
+	// "far" away it is from the normalized value) to adjust.  Also, since
+	// negating a value pushes it one more order of magnitude away from the
+	// normalized range, add 1 to compensate.
+	//
+	// For some intuition here, imagine you're performing mod 12 arithmetic
+	// (picture a clock) and you are negating the number 7.  So you start at
+	// 12 (which is of course 0 under mod 12) and count backwards (left on
+	// the clock) 7 times to arrive at 5.  Notice this is just 12-7 = 5.
+	// Now, assume you're starting with 19, which is a number that is
+	// already larger than the modulus and congruent to 7 (mod 12).  When a
+	// value is already in the desired range, its magnitude is 1.  Since 19
+	// is an additional "step", its magnitude (mod 12) is 2.  Since any
+	// multiple of the modulus is congruent to zero (mod m), the answer can
+	// be shortcut by simply multiplying the magnitude by the modulus and
+	// subtracting.  Keeping with the example, this would be (2*12)-19 = 5.
+	f.n[0] = (magnitude+1)*fieldPrimeWordZero - val.n[0]
+	f.n[1] = (magnitude+1)*fieldPrimeWordOne - val.n[1]
+	f.n[2] = (magnitude+1)*fieldBaseMask - val.n[2]
+	f.n[3] = (magnitude+1)*fieldBaseMask - val.n[3]
+	f.n[4] = (magnitude+1)*fieldBaseMask - val.n[4]
+	f.n[5] = (magnitude+1)*fieldBaseMask - val.n[5]
+	f.n[6] = (magnitude+1)*fieldBaseMask - val.n[6]
+	f.n[7] = (magnitude+1)*fieldBaseMask - val.n[7]
+	f.n[8] = (magnitude+1)*fieldBaseMask - val.n[8]
+	f.n[9] = (magnitude+1)*fieldMSBMask - val.n[9]
+
+	return f
+}
+
+// Negate negates the field value.  The existing field value is modified.  The
+// caller must provide the magnitude of the field value for a correct result.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f.Negate(1).AddInt(1) so that f = -f + 1.
+func (f *fieldVal) Negate(magnitude uint32) *fieldVal {
+	return f.NegateVal(f, magnitude)
+}
+
+// AddInt adds the passed integer to the existing field value and stores the
+// result in f.  This is a convenience function since it is fairly common to
+// perform some arithmetic with small native integers.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f.AddInt(1).Add(f2) so that f = f + 1 + f2.
+func (f *fieldVal) AddInt(ui uint) *fieldVal {
+	// Since the field representation intentionally provides overflow bits,
+	// it's ok to use carryless addition as the carry bit is safely part of
+	// the word and is propagated later by Normalize.
+	f.n[0] += uint32(ui)
+
+	return f
+}
+
+// Add adds the passed value to the existing field value and stores the result
+// in f.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f.Add(f2).AddInt(1) so that f = f + f2 + 1.
+func (f *fieldVal) Add(val *fieldVal) *fieldVal {
+	// Since the field representation intentionally provides overflow bits,
+	// it's ok to use carryless addition as the carry bit is safely part of
+	// each word and is propagated later by Normalize.  This could obviously
+	// be done in a loop, but the unrolled version is faster.
+	f.n[0] += val.n[0]
+	f.n[1] += val.n[1]
+	f.n[2] += val.n[2]
+	f.n[3] += val.n[3]
+	f.n[4] += val.n[4]
+	f.n[5] += val.n[5]
+	f.n[6] += val.n[6]
+	f.n[7] += val.n[7]
+	f.n[8] += val.n[8]
+	f.n[9] += val.n[9]
+
+	return f
+}
+
+// Add2 adds the passed two field values together and stores the result in f.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f3.Add2(f, f2).AddInt(1) so that f3 = f + f2 + 1.
+func (f *fieldVal) Add2(val *fieldVal, val2 *fieldVal) *fieldVal {
+	// Since the field representation intentionally provides overflow bits,
+	// it's ok to use carryless addition as the carry bit is safely part of
+	// each word and is propagated later by Normalize.  This could obviously
+	// be done in a loop, but the unrolled version is faster.
+	f.n[0] = val.n[0] + val2.n[0]
+	f.n[1] = val.n[1] + val2.n[1]
+	f.n[2] = val.n[2] + val2.n[2]
+	f.n[3] = val.n[3] + val2.n[3]
+	f.n[4] = val.n[4] + val2.n[4]
+	f.n[5] = val.n[5] + val2.n[5]
+	f.n[6] = val.n[6] + val2.n[6]
+	f.n[7] = val.n[7] + val2.n[7]
+	f.n[8] = val.n[8] + val2.n[8]
+	f.n[9] = val.n[9] + val2.n[9]
+
+	return f
+}
+
+// MulInt multiplies the field value by the passed int and stores the result in
+// f.  Note that this function can overflow if multiplying the value by any of
+// the individual words exceeds a max uint32.  Therefore it is important that
+// the caller ensures no overflows will occur before using this function.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f.MulInt(2).Add(f2) so that f = 2 * f + f2.
+func (f *fieldVal) MulInt(val uint) *fieldVal {
+	// Since each word of the field representation can hold up to
+	// fieldOverflowBits extra bits which will be normalized out, it's safe
+	// to multiply each word without using a larger type or carry
+	// propagation so long as the values won't overflow a uint32.  This
+	// could obviously be done in a loop, but the unrolled version is
+	// faster.  Note that val is truncated to its low 32 bits.
+	ui := uint32(val)
+	f.n[0] *= ui
+	f.n[1] *= ui
+	f.n[2] *= ui
+	f.n[3] *= ui
+	f.n[4] *= ui
+	f.n[5] *= ui
+	f.n[6] *= ui
+	f.n[7] *= ui
+	f.n[8] *= ui
+	f.n[9] *= ui
+
+	return f
+}
+
+// Mul multiplies the passed value to the existing field value and stores the
+// result in f.  Note that this function can overflow if multiplying any
+// of the individual words exceeds a max uint32.  In practice, this means the
+// magnitude of either value involved in the multiplication must be a max of
+// 8.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f.Mul(f2).AddInt(1) so that f = (f * f2) + 1.
+func (f *fieldVal) Mul(val *fieldVal) *fieldVal {
+	return f.Mul2(f, val)
+}
+
+// Mul2 multiplies the passed two field values together and stores the result
+// in f.  Note that this function can overflow if multiplying any of
+// the individual words exceeds a max uint32.  In practice, this means the
+// magnitude of either value involved in the multiplication must be a max of
+// 8.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f3.Mul2(f, f2).AddInt(1) so that f3 = (f * f2) + 1.
+func (f *fieldVal) Mul2(val *fieldVal, val2 *fieldVal) *fieldVal {
+	// This could be done with a couple of for loops and an array to store
+	// the intermediate terms, but this unrolled version is significantly
+	// faster.
+
+	// Terms for 2^(fieldBase*0).
+	m := uint64(val.n[0]) * uint64(val2.n[0])
+	t0 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*1).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[1]) +
+		uint64(val.n[1])*uint64(val2.n[0])
+	t1 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*2).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[2]) +
+		uint64(val.n[1])*uint64(val2.n[1]) +
+		uint64(val.n[2])*uint64(val2.n[0])
+	t2 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*3).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[3]) +
+		uint64(val.n[1])*uint64(val2.n[2]) +
+		uint64(val.n[2])*uint64(val2.n[1]) +
+		uint64(val.n[3])*uint64(val2.n[0])
+	t3 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*4).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[4]) +
+		uint64(val.n[1])*uint64(val2.n[3]) +
+		uint64(val.n[2])*uint64(val2.n[2]) +
+		uint64(val.n[3])*uint64(val2.n[1]) +
+		uint64(val.n[4])*uint64(val2.n[0])
+	t4 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*5).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[5]) +
+		uint64(val.n[1])*uint64(val2.n[4]) +
+		uint64(val.n[2])*uint64(val2.n[3]) +
+		uint64(val.n[3])*uint64(val2.n[2]) +
+		uint64(val.n[4])*uint64(val2.n[1]) +
+		uint64(val.n[5])*uint64(val2.n[0])
+	t5 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*6).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[6]) +
+		uint64(val.n[1])*uint64(val2.n[5]) +
+		uint64(val.n[2])*uint64(val2.n[4]) +
+		uint64(val.n[3])*uint64(val2.n[3]) +
+		uint64(val.n[4])*uint64(val2.n[2]) +
+		uint64(val.n[5])*uint64(val2.n[1]) +
+		uint64(val.n[6])*uint64(val2.n[0])
+	t6 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*7).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[7]) +
+		uint64(val.n[1])*uint64(val2.n[6]) +
+		uint64(val.n[2])*uint64(val2.n[5]) +
+		uint64(val.n[3])*uint64(val2.n[4]) +
+		uint64(val.n[4])*uint64(val2.n[3]) +
+		uint64(val.n[5])*uint64(val2.n[2]) +
+		uint64(val.n[6])*uint64(val2.n[1]) +
+		uint64(val.n[7])*uint64(val2.n[0])
+	t7 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*8).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[8]) +
+		uint64(val.n[1])*uint64(val2.n[7]) +
+		uint64(val.n[2])*uint64(val2.n[6]) +
+		uint64(val.n[3])*uint64(val2.n[5]) +
+		uint64(val.n[4])*uint64(val2.n[4]) +
+		uint64(val.n[5])*uint64(val2.n[3]) +
+		uint64(val.n[6])*uint64(val2.n[2]) +
+		uint64(val.n[7])*uint64(val2.n[1]) +
+		uint64(val.n[8])*uint64(val2.n[0])
+	t8 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*9).
+	m = (m >> fieldBase) +
+		uint64(val.n[0])*uint64(val2.n[9]) +
+		uint64(val.n[1])*uint64(val2.n[8]) +
+		uint64(val.n[2])*uint64(val2.n[7]) +
+		uint64(val.n[3])*uint64(val2.n[6]) +
+		uint64(val.n[4])*uint64(val2.n[5]) +
+		uint64(val.n[5])*uint64(val2.n[4]) +
+		uint64(val.n[6])*uint64(val2.n[3]) +
+		uint64(val.n[7])*uint64(val2.n[2]) +
+		uint64(val.n[8])*uint64(val2.n[1]) +
+		uint64(val.n[9])*uint64(val2.n[0])
+	t9 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*10).
+	m = (m >> fieldBase) +
+		uint64(val.n[1])*uint64(val2.n[9]) +
+		uint64(val.n[2])*uint64(val2.n[8]) +
+		uint64(val.n[3])*uint64(val2.n[7]) +
+		uint64(val.n[4])*uint64(val2.n[6]) +
+		uint64(val.n[5])*uint64(val2.n[5]) +
+		uint64(val.n[6])*uint64(val2.n[4]) +
+		uint64(val.n[7])*uint64(val2.n[3]) +
+		uint64(val.n[8])*uint64(val2.n[2]) +
+		uint64(val.n[9])*uint64(val2.n[1])
+	t10 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*11).
+	m = (m >> fieldBase) +
+		uint64(val.n[2])*uint64(val2.n[9]) +
+		uint64(val.n[3])*uint64(val2.n[8]) +
+		uint64(val.n[4])*uint64(val2.n[7]) +
+		uint64(val.n[5])*uint64(val2.n[6]) +
+		uint64(val.n[6])*uint64(val2.n[5]) +
+		uint64(val.n[7])*uint64(val2.n[4]) +
+		uint64(val.n[8])*uint64(val2.n[3]) +
+		uint64(val.n[9])*uint64(val2.n[2])
+	t11 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*12).
+	m = (m >> fieldBase) +
+		uint64(val.n[3])*uint64(val2.n[9]) +
+		uint64(val.n[4])*uint64(val2.n[8]) +
+		uint64(val.n[5])*uint64(val2.n[7]) +
+		uint64(val.n[6])*uint64(val2.n[6]) +
+		uint64(val.n[7])*uint64(val2.n[5]) +
+		uint64(val.n[8])*uint64(val2.n[4]) +
+		uint64(val.n[9])*uint64(val2.n[3])
+	t12 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*13).
+	m = (m >> fieldBase) +
+		uint64(val.n[4])*uint64(val2.n[9]) +
+		uint64(val.n[5])*uint64(val2.n[8]) +
+		uint64(val.n[6])*uint64(val2.n[7]) +
+		uint64(val.n[7])*uint64(val2.n[6]) +
+		uint64(val.n[8])*uint64(val2.n[5]) +
+		uint64(val.n[9])*uint64(val2.n[4])
+	t13 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*14).
+	m = (m >> fieldBase) +
+		uint64(val.n[5])*uint64(val2.n[9]) +
+		uint64(val.n[6])*uint64(val2.n[8]) +
+		uint64(val.n[7])*uint64(val2.n[7]) +
+		uint64(val.n[8])*uint64(val2.n[6]) +
+		uint64(val.n[9])*uint64(val2.n[5])
+	t14 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*15).
+	m = (m >> fieldBase) +
+		uint64(val.n[6])*uint64(val2.n[9]) +
+		uint64(val.n[7])*uint64(val2.n[8]) +
+		uint64(val.n[8])*uint64(val2.n[7]) +
+		uint64(val.n[9])*uint64(val2.n[6])
+	t15 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*16).
+	m = (m >> fieldBase) +
+		uint64(val.n[7])*uint64(val2.n[9]) +
+		uint64(val.n[8])*uint64(val2.n[8]) +
+		uint64(val.n[9])*uint64(val2.n[7])
+	t16 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*17).
+	m = (m >> fieldBase) +
+		uint64(val.n[8])*uint64(val2.n[9]) +
+		uint64(val.n[9])*uint64(val2.n[8])
+	t17 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*18).
+	m = (m >> fieldBase) + uint64(val.n[9])*uint64(val2.n[9])
+	t18 := m & fieldBaseMask
+
+	// What's left is for 2^(fieldBase*19).
+	t19 := m >> fieldBase
+
+	// At this point, all of the terms are grouped into their respective
+	// base.
+	//
+	// Per [HAC] section 14.3.4: Reduction method of moduli of special form,
+	// when the modulus is of the special form m = b^t - c, highly efficient
+	// reduction can be achieved per the provided algorithm.
+	//
+	// The secp256k1 prime is equivalent to 2^256 - 4294968273, so it fits
+	// this criteria.
+	//
+	// 4294968273 in field representation (base 2^26) is:
+	// n[0] = 977
+	// n[1] = 64
+	// That is to say (2^26 * 64) + 977 = 4294968273
+	//
+	// Since each word is in base 26, the upper terms (t10 and up) start
+	// at 260 bits (versus the final desired range of 256 bits), so the
+	// field representation of 'c' from above needs to be adjusted for the
+	// extra 4 bits by multiplying it by 2^4 = 16.  4294968273 * 16 =
+	// 68719492368.  Thus, the adjusted field representation of 'c' is:
+	// n[0] = 977 * 16 = 15632
+	// n[1] = 64 * 16 = 1024
+	// That is to say (2^26 * 1024) + 15632 = 68719492368
+	//
+	// To reduce the final term, t19, the entire 'c' value is needed instead
+	// of only n[0] because there are no more terms left to handle n[1].
+	// This means there might be some magnitude left in the upper bits that
+	// is handled below.
+	m = t0 + t10*15632
+	t0 = m & fieldBaseMask
+	m = (m >> fieldBase) + t1 + t10*1024 + t11*15632
+	t1 = m & fieldBaseMask
+	m = (m >> fieldBase) + t2 + t11*1024 + t12*15632
+	t2 = m & fieldBaseMask
+	m = (m >> fieldBase) + t3 + t12*1024 + t13*15632
+	t3 = m & fieldBaseMask
+	m = (m >> fieldBase) + t4 + t13*1024 + t14*15632
+	t4 = m & fieldBaseMask
+	m = (m >> fieldBase) + t5 + t14*1024 + t15*15632
+	t5 = m & fieldBaseMask
+	m = (m >> fieldBase) + t6 + t15*1024 + t16*15632
+	t6 = m & fieldBaseMask
+	m = (m >> fieldBase) + t7 + t16*1024 + t17*15632
+	t7 = m & fieldBaseMask
+	m = (m >> fieldBase) + t8 + t17*1024 + t18*15632
+	t8 = m & fieldBaseMask
+	m = (m >> fieldBase) + t9 + t18*1024 + t19*68719492368
+	t9 = m & fieldMSBMask
+	m = m >> fieldMSBBits
+
+	// At this point, if the magnitude is greater than 0, the overall value
+	// is greater than the max possible 256-bit value.  In particular, it is
+	// "how many times larger" than the max value it is.
+	//
+	// The algorithm presented in [HAC] section 14.3.4 repeats until the
+	// quotient is zero.  However, due to the above, we already know at
+	// least how many times we would need to repeat as it's the value
+	// currently in m.  Thus we can simply multiply the magnitude by the
+	// field representation of the prime and do a single iteration.  Notice
+	// that nothing will be changed when the magnitude is zero, so we could
+	// skip this in that case, however always running regardless allows it
+	// to run in constant time.  The final result will be in the range
+	// 0 <= result <= prime + (2^64 - c), so it is guaranteed to have a
+	// magnitude of 1, but it is denormalized.
+	d := t0 + m*977
+	f.n[0] = uint32(d & fieldBaseMask)
+	d = (d >> fieldBase) + t1 + m*64
+	f.n[1] = uint32(d & fieldBaseMask)
+	f.n[2] = uint32((d >> fieldBase) + t2)
+	f.n[3] = uint32(t3)
+	f.n[4] = uint32(t4)
+	f.n[5] = uint32(t5)
+	f.n[6] = uint32(t6)
+	f.n[7] = uint32(t7)
+	f.n[8] = uint32(t8)
+	f.n[9] = uint32(t9)
+
+	return f
+}
+
+// Square replaces f with f^2 by squaring it in place via SquareVal.  The
+// magnitude of f must be at most 8 when this is called, otherwise the
+// internal word arithmetic can overflow.
+//
+// The receiver is returned so calls can be chained, for example
+// f.Square().Mul(f2) leaves f = f^2 * f2.
+func (f *fieldVal) Square() *fieldVal {
+	f.SquareVal(f)
+	return f
+}
+
+// SquareVal computes val^2 modulo the field prime and stores the result in f.
+// val may alias f (as Square does) because every word of val is read before
+// any word of f is written.  Note that this function can overflow if the
+// words of val are too large; in practice the magnitude of the value being
+// squared must be a max of 8.  The result has a magnitude of 1.
+//
+// f is returned to support chaining, e.g. f3.SquareVal(f).Mul(f) gives f^3.
+func (f *fieldVal) SquareVal(val *fieldVal) *fieldVal {
+	// Unrolled for speed rather than looping over an array of intermediate
+	// terms.  Cross products val.n[i]*val.n[j] (i != j) occur twice in a
+	// square, hence the factor of 2; diagonal terms occur once.
+
+	// Terms for 2^(fieldBase*0).
+	m := uint64(val.n[0]) * uint64(val.n[0])
+	t0 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*1).
+	m = (m >> fieldBase) + 2*uint64(val.n[0])*uint64(val.n[1])
+	t1 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*2).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[2]) +
+		uint64(val.n[1])*uint64(val.n[1])
+	t2 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*3).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[3]) +
+		2*uint64(val.n[1])*uint64(val.n[2])
+	t3 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*4).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[4]) +
+		2*uint64(val.n[1])*uint64(val.n[3]) +
+		uint64(val.n[2])*uint64(val.n[2])
+	t4 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*5).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[5]) +
+		2*uint64(val.n[1])*uint64(val.n[4]) +
+		2*uint64(val.n[2])*uint64(val.n[3])
+	t5 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*6).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[6]) +
+		2*uint64(val.n[1])*uint64(val.n[5]) +
+		2*uint64(val.n[2])*uint64(val.n[4]) +
+		uint64(val.n[3])*uint64(val.n[3])
+	t6 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*7).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[7]) +
+		2*uint64(val.n[1])*uint64(val.n[6]) +
+		2*uint64(val.n[2])*uint64(val.n[5]) +
+		2*uint64(val.n[3])*uint64(val.n[4])
+	t7 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*8).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[8]) +
+		2*uint64(val.n[1])*uint64(val.n[7]) +
+		2*uint64(val.n[2])*uint64(val.n[6]) +
+		2*uint64(val.n[3])*uint64(val.n[5]) +
+		uint64(val.n[4])*uint64(val.n[4])
+	t8 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*9).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[0])*uint64(val.n[9]) +
+		2*uint64(val.n[1])*uint64(val.n[8]) +
+		2*uint64(val.n[2])*uint64(val.n[7]) +
+		2*uint64(val.n[3])*uint64(val.n[6]) +
+		2*uint64(val.n[4])*uint64(val.n[5])
+	t9 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*10).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[1])*uint64(val.n[9]) +
+		2*uint64(val.n[2])*uint64(val.n[8]) +
+		2*uint64(val.n[3])*uint64(val.n[7]) +
+		2*uint64(val.n[4])*uint64(val.n[6]) +
+		uint64(val.n[5])*uint64(val.n[5])
+	t10 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*11).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[2])*uint64(val.n[9]) +
+		2*uint64(val.n[3])*uint64(val.n[8]) +
+		2*uint64(val.n[4])*uint64(val.n[7]) +
+		2*uint64(val.n[5])*uint64(val.n[6])
+	t11 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*12).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[3])*uint64(val.n[9]) +
+		2*uint64(val.n[4])*uint64(val.n[8]) +
+		2*uint64(val.n[5])*uint64(val.n[7]) +
+		uint64(val.n[6])*uint64(val.n[6])
+	t12 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*13).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[4])*uint64(val.n[9]) +
+		2*uint64(val.n[5])*uint64(val.n[8]) +
+		2*uint64(val.n[6])*uint64(val.n[7])
+	t13 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*14).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[5])*uint64(val.n[9]) +
+		2*uint64(val.n[6])*uint64(val.n[8]) +
+		uint64(val.n[7])*uint64(val.n[7])
+	t14 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*15).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[6])*uint64(val.n[9]) +
+		2*uint64(val.n[7])*uint64(val.n[8])
+	t15 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*16).
+	m = (m >> fieldBase) +
+		2*uint64(val.n[7])*uint64(val.n[9]) +
+		uint64(val.n[8])*uint64(val.n[8])
+	t16 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*17).
+	m = (m >> fieldBase) + 2*uint64(val.n[8])*uint64(val.n[9])
+	t17 := m & fieldBaseMask
+
+	// Terms for 2^(fieldBase*18).
+	m = (m >> fieldBase) + uint64(val.n[9])*uint64(val.n[9])
+	t18 := m & fieldBaseMask
+
+	// What's left is for 2^(fieldBase*19).
+	t19 := m >> fieldBase
+
+	// At this point, all of the terms are grouped into their respective
+	// base.
+	//
+	// Per [HAC] section 14.3.4: Reduction method of moduli of special form,
+	// when the modulus is of the special form m = b^t - c, highly efficient
+	// reduction can be achieved per the provided algorithm.
+	//
+	// The secp256k1 prime is equivalent to 2^256 - 4294968273, so it fits
+	// this criterion.
+	//
+	// 4294968273 in field representation (base 2^26) is:
+	// n[0] = 977
+	// n[1] = 64
+	// That is to say (2^26 * 64) + 977 = 4294968273
+	//
+	// Since each word is in base 2^26, the upper terms (t10 and up) start
+	// at 260 bits (versus the final desired range of 256 bits), so the
+	// field representation of 'c' from above needs to be adjusted for the
+	// extra 4 bits by multiplying it by 2^4 = 16.  4294968273 * 16 =
+	// 68719492368.  Thus, the adjusted field representation of 'c' is:
+	// n[0] = 977 * 16 = 15632
+	// n[1] = 64 * 16 = 1024
+	// That is to say (2^26 * 1024) + 15632 = 68719492368
+	//
+	// To reduce the final term, t19, the entire 'c' value is needed instead
+	// of only n[0] because there are no more terms left to handle n[1].
+	// This means there might be some magnitude left in the upper bits that
+	// is handled below.
+	m = t0 + t10*15632
+	t0 = m & fieldBaseMask
+	m = (m >> fieldBase) + t1 + t10*1024 + t11*15632
+	t1 = m & fieldBaseMask
+	m = (m >> fieldBase) + t2 + t11*1024 + t12*15632
+	t2 = m & fieldBaseMask
+	m = (m >> fieldBase) + t3 + t12*1024 + t13*15632
+	t3 = m & fieldBaseMask
+	m = (m >> fieldBase) + t4 + t13*1024 + t14*15632
+	t4 = m & fieldBaseMask
+	m = (m >> fieldBase) + t5 + t14*1024 + t15*15632
+	t5 = m & fieldBaseMask
+	m = (m >> fieldBase) + t6 + t15*1024 + t16*15632
+	t6 = m & fieldBaseMask
+	m = (m >> fieldBase) + t7 + t16*1024 + t17*15632
+	t7 = m & fieldBaseMask
+	m = (m >> fieldBase) + t8 + t17*1024 + t18*15632
+	t8 = m & fieldBaseMask
+	m = (m >> fieldBase) + t9 + t18*1024 + t19*68719492368
+	t9 = m & fieldMSBMask
+	m = m >> fieldMSBBits
+
+	// At this point, if the magnitude is greater than 0, the overall value
+	// is greater than the max possible 256-bit value.  In particular, it is
+	// "how many times larger" than the max value it is.
+	//
+	// The algorithm presented in [HAC] section 14.3.4 repeats until the
+	// quotient is zero.  However, due to the above, we already know at
+	// least how many times we would need to repeat as it's the value
+	// currently in m.  Thus we can simply multiply the magnitude by the
+	// field representation of c (n[0] = 977, n[1] = 64) and do a single
+	// iteration.  Nothing changes when the magnitude is zero, so this
+	// could be skipped in that case, however always running regardless
+	// allows it to run in constant time.  The final result will be in the
+	// range 0 <= result <= prime + (2^64 - c), so it is guaranteed to have
+	// a magnitude of 1, but it is denormalized.
+	n := t0 + m*977
+	f.n[0] = uint32(n & fieldBaseMask)
+	n = (n >> fieldBase) + t1 + m*64
+	f.n[1] = uint32(n & fieldBaseMask)
+	f.n[2] = uint32((n >> fieldBase) + t2)
+	f.n[3] = uint32(t3)
+	f.n[4] = uint32(t4)
+	f.n[5] = uint32(t5)
+	f.n[6] = uint32(t6)
+	f.n[7] = uint32(t7)
+	f.n[8] = uint32(t8)
+	f.n[9] = uint32(t9)
+
+	return f
+}
+
+// Inverse replaces f with its modular multiplicative inverse, computed as
+// f^(p-2) where p is the secp256k1 field prime.
+//
+// The field value is returned to support chaining.  This enables syntax like:
+// f.Inverse().Mul(f2) so that f = f^-1 * f2.
+func (f *fieldVal) Inverse() *fieldVal {
+	// Fermat's little theorem states that for a nonzero number a and a
+	// prime p, a^(p-1) = 1 (mod p).  The multiplicative inverse b of a
+	// satisfies a*b = 1 (mod p), and a*a^(p-2) = a^(p-1) = 1 (mod p).
+	// Thus, b = a^(p-2) is the multiplicative inverse.
+	//
+	// In order to efficiently compute a^(p-2), p-2 needs to be split into
+	// a sequence of squares and multiplications that minimizes the number
+	// of multiplications needed (since they are more costly than
+	// squarings).  Intermediate results are saved and reused as well; the
+	// aN variables below hold a^N.
+	//
+	// The secp256k1 prime - 2 is 2^256 - 4294968275.  The chain below costs
+	// 258 field squarings and 33 field multiplications.
+	var a2, a3, a4, a10, a11, a21, a42, a45, a63, a1019, a1023 fieldVal
+	a2.SquareVal(f)
+	a3.Mul2(&a2, f)
+	a4.SquareVal(&a2)
+	a10.SquareVal(&a4).Mul(&a2)
+	a11.Mul2(&a10, f)
+	a21.Mul2(&a10, &a11)
+	a42.SquareVal(&a21)
+	a45.Mul2(&a42, &a3)
+	a63.Mul2(&a42, &a21)
+	a1019.SquareVal(&a63).Square().Square().Square().Mul(&a11)
+	a1023.Mul2(&a1019, &a4)
+	f.Set(&a63)                                    // f = a^(2^6 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^11 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^16 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^16 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^21 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^26 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^26 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^31 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^36 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^36 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^41 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^46 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^46 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^51 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^56 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^56 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^61 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^66 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^66 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^71 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^76 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^76 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^81 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^86 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^86 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^91 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^96 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^96 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^101 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^106 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^106 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^111 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^116 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^116 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^121 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^126 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^126 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^131 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^136 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^136 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^141 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^146 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^146 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^151 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^156 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^156 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^161 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^166 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^166 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^171 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^176 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^176 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^181 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^186 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^186 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^191 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^196 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^196 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^201 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^206 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^206 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^211 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^216 - 1024)
+	f.Mul(&a1023)                                  // f = a^(2^216 - 1)
+	f.Square().Square().Square().Square().Square() // f = a^(2^221 - 32)
+	f.Square().Square().Square().Square().Square() // f = a^(2^226 - 1024)
+	f.Mul(&a1019)                                  // f = a^(2^226 - 5)
+	f.Square().Square().Square().Square().Square() // f = a^(2^231 - 160)
+	f.Square().Square().Square().Square().Square() // f = a^(2^236 - 5120)
+	f.Mul(&a1023)                                  // f = a^(2^236 - 4097)
+	f.Square().Square().Square().Square().Square() // f = a^(2^241 - 131104)
+	f.Square().Square().Square().Square().Square() // f = a^(2^246 - 4195328)
+	f.Mul(&a1023)                                  // f = a^(2^246 - 4194305)
+	f.Square().Square().Square().Square().Square() // f = a^(2^251 - 134217760)
+	f.Square().Square().Square().Square().Square() // f = a^(2^256 - 4294968320)
+	return f.Mul(&a45)                             // f = a^(2^256 - 4294968275) = a^(p-2)
+}
+
+// NewFieldVal allocates a fresh field value whose words are all zero, i.e. the
+// field element 0.  Ordinary users of this package never need to handle field
+// values themselves; the constructor exists so tests can create them.
+func NewFieldVal() *fieldVal {
+	return &fieldVal{}
+}