diff --git a/core/crypto/_fiat/field_p256r1/field.odin b/core/crypto/_fiat/field_p256r1/field.odin new file mode 100644 index 00000000000..6ecfea8f8d6 --- /dev/null +++ b/core/crypto/_fiat/field_p256r1/field.odin @@ -0,0 +1,345 @@
package field_p256r1

import "core:encoding/endian"
import "core:math/bits"
import "core:mem"

// fe_clear overwrites the field element with zeroes using
// `mem.zero_explicit`.
fe_clear :: proc "contextless" (arg1: ^Montgomery_Domain_Field_Element) {
	mem.zero_explicit(arg1, size_of(Montgomery_Domain_Field_Element))
}

// fe_clear_vec calls `fe_clear` on every element referenced by arg1.
fe_clear_vec :: proc "contextless" (
	arg1: []^Montgomery_Domain_Field_Element,
) {
	for fe in arg1 {
		fe_clear(fe)
	}
}

// fe_from_bytes decodes the 32-byte big-endian encoding arg1 into out1
// (Montgomery domain).
//
// Unless `unsafe_assume_canonical` is set, the decoded value is checked
// to be in the range [0, ELL); if it is not, false is returned and out1
// is left unmodified.  Returns true otherwise.
fe_from_bytes :: proc "contextless" (
	out1: ^Montgomery_Domain_Field_Element,
	arg1: []byte,
	unsafe_assume_canonical := false,
) -> bool {
	ensure_contextless(len(arg1) == 32, "p256r1: invalid fe input buffer")

	// Note: We assume the input is in big-endian, while the limbs are
	// stored least-significant first, so the last 8 bytes of the input
	// form limb 0, and each limb is itself read as big-endian.
	tmp := Non_Montgomery_Domain_Field_Element {
		endian.unchecked_get_u64be(arg1[24:]),
		endian.unchecked_get_u64be(arg1[16:]),
		endian.unchecked_get_u64be(arg1[8:]),
		endian.unchecked_get_u64be(arg1[0:]),
	}
	defer mem.zero_explicit(&tmp, size_of(tmp))

	// Check that tmp is in the range [0, ELL).
	if !unsafe_assume_canonical {
		// Computes `(ELL - 1) - tmp`, only the final borrow matters:
		// it is set iff `tmp > ELL - 1`.
		_, borrow := bits.sub_u64(ELL[0] - 1, tmp[0], 0)
		_, borrow = bits.sub_u64(ELL[1], tmp[1], borrow)
		_, borrow = bits.sub_u64(ELL[2], tmp[2], borrow)
		_, borrow = bits.sub_u64(ELL[3], tmp[3], borrow)
		if borrow != 0 {
			return false
		}
	}

	fe_to_montgomery(out1, &tmp)

	return true
}

// fe_to_bytes writes the 32-byte big-endian encoding of arg1 to out1.
fe_to_bytes :: proc "contextless" (out1: []byte, arg1: ^Montgomery_Domain_Field_Element) {
	ensure_contextless(len(out1) == 32, "p256r1: invalid fe output buffer")

	tmp: Non_Montgomery_Domain_Field_Element
	fe_from_montgomery(&tmp, arg1)

	// Note: Likewise, output in big-endian (limb 0 is the least
	// significant limb, so it goes last).
	endian.unchecked_put_u64be(out1[24:], tmp[0])
	endian.unchecked_put_u64be(out1[16:], tmp[1])
	endian.unchecked_put_u64be(out1[8:], tmp[2])
	endian.unchecked_put_u64be(out1[0:], tmp[3])

	mem.zero_explicit(&tmp, size_of(tmp))
}

// fe_equal returns 1 iff `arg1 == arg2`, 0 otherwise.
@(require_results)
fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) -> int {
	tmp: Montgomery_Domain_Field_Element
	fe_sub(&tmp, arg1, arg2)

	// `fe_non_zero(&tmp) - 1` underflows iff the difference is 0, that
	// is iff arg1 == arg2, and we return the borrow, which will be 1.
	_, borrow := bits.sub_u64(fe_non_zero(&tmp), 1, 0)

	fe_clear(&tmp)

	return int(borrow)
}

// fe_is_odd returns the least significant bit of arg1 after conversion
// out of the Montgomery domain (1 if odd, 0 if even).
@(require_results)
fe_is_odd :: proc "contextless" (arg1: ^Montgomery_Domain_Field_Element) -> int {
	tmp: Non_Montgomery_Domain_Field_Element
	defer mem.zero_explicit(&tmp, size_of(tmp))

	fe_from_montgomery(&tmp, arg1)
	return int(tmp[0] & 1)
}

// fe_pow2k sets `out1 = arg1^(2^arg2)`, by squaring arg1 arg2 times.
// out1 and arg1 may alias.
fe_pow2k :: proc "contextless" (
	out1: ^Montgomery_Domain_Field_Element,
	arg1: ^Montgomery_Domain_Field_Element,
	arg2: uint,
) {
	// Special case: `arg1^(2^0) = arg1` (zero squarings), though this
	// should never happen.
	if arg2 == 0 {
		fe_set(out1, arg1)
		return
	}

	fe_square(out1, arg1)
	for _ in 1 ..< arg2 {
		fe_square(out1, out1)
	}
}

// fe_inv sets `out1 = arg1^(p - 2)` via a fixed addition chain.
// out1 and arg1 may alias.
fe_inv :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) {
	// Inversion computation is derived from the addition chain:
	//
	//	_10     = 2*1
	//	_11     = 1 + _10
	//	_110    = 2*_11
	//	_111    = 1 + _110
	//	_111000 = _111 << 3
	//	_111111 = _111 + _111000
	//	x12     = _111111 << 6 + _111111
	//	x15     = x12 << 3 + _111
	//	x16     = 2*x15 + 1
	//	x32     = x16 << 16 + x16
	//	i53     = x32 << 15
	//	x47     = x15 + i53
	//	i263    = ((i53 << 17 + 1) << 143 + x47) << 47
	//	return    (x47 + i263) << 2
	//
	// Operations: 255 squares 11 multiplies
	//
	// Generated by github.com/mmcloughlin/addchain v0.4.0.

	// Note: Need to stash `arg1` (`xx`) in the case that `out1`/`arg1` alias,
	// as `arg1` is used after `out1` has been altered.
	t0, t1, xx: Montgomery_Domain_Field_Element = ---, ---, arg1^

	// Step 1: z = x^0x2
	fe_square(out1, arg1)

	// Step 2: z = x^0x3
	fe_mul(out1, &xx, out1)

	// Step 3: z = x^0x6
	fe_square(out1, out1)

	// Step 4: z = x^0x7
	fe_mul(out1, &xx, out1)

	// Step 7: t0 = x^0x38
	fe_pow2k(&t0, out1, 3)

	// Step 8: t0 = x^0x3f
	fe_mul(&t0, out1, &t0)

	// Step 14: t1 = x^0xfc0
	fe_pow2k(&t1, &t0, 6)

	// Step 15: t0 = x^0xfff
	fe_mul(&t0, &t0, &t1)

	// Step 18: t0 = x^0x7ff8
	fe_pow2k(&t0, &t0, 3)

	// Step 19: z = x^0x7fff
	fe_mul(out1, out1, &t0)

	// Step 20: t0 = x^0xfffe
	fe_square(&t0, out1)

	// Step 21: t0 = x^0xffff
	fe_mul(&t0, &xx, &t0)

	// Step 37: t1 = x^0xffff0000
	fe_pow2k(&t1, &t0, 16)

	// Step 38: t0 = x^0xffffffff
	fe_mul(&t0, &t0, &t1)

	// Step 53: t0 = x^0x7fffffff8000
	fe_pow2k(&t0, &t0, 15)

	// Step 54: z = x^0x7fffffffffff
	fe_mul(out1, out1, &t0)

	// Step 71: t0 = x^0xffffffff00000000
	fe_pow2k(&t0, &t0, 17)

	// Step 72: t0 = x^0xffffffff00000001
	fe_mul(&t0, &xx, &t0)

	// Step 215: t0 = x^0x7fffffff80000000800000000000000000000000000000000000
	fe_pow2k(&t0, &t0, 143)

	// Step 216: t0 = x^0x7fffffff800000008000000000000000000000007fffffffffff
	fe_mul(&t0, out1, &t0)

	// Step 263: t0 = x^0x3fffffffc00000004000000000000000000000003fffffffffff800000000000
	fe_pow2k(&t0, &t0, 47)

	// Step 264: z = x^0x3fffffffc00000004000000000000000000000003fffffffffffffffffffffff
	fe_mul(out1, out1, &t0)

	// Step 266: z = x^0xffffffff00000001000000000000000000000000fffffffffffffffffffffffc
	fe_pow2k(out1, out1, 2)

	// The chain above produces `x^(p - 3)`, one more multiply by `x`
	// gives `x^(p - 2)`:
	// z = x^0xffffffff00000001000000000000000000000000fffffffffffffffffffffffd
	fe_mul(out1, out1, &xx)

	fe_clear_vec([]^Montgomery_Domain_Field_Element{&t0, &t1, &xx})
}

// fe_sqrt attempts to compute a square root of arg1.
//
// The candidate `arg1^((p + 1) / 4)` is squared and compared against
// arg1.  If they match, out1 is set to the candidate and 1 is returned,
// otherwise out1 is set to zero and 0 is returned.  out1 and arg1 may
// alias.
@(require_results)
fe_sqrt :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) -> int {
	// Square root candidate can be derived via exponentiation by `(p + 1) / 4`
	// From sage: 28948022302589062190674361737351893382521535853822578548883407827216774463488
	//
	// Exponentiation computation is derived from the addition chain:
	//
	//	_10       = 2*1
	//	_11       = 1 + _10
	//	_1100     = _11 << 2
	//	_1111     = _11 + _1100
	//	_11110000 = _1111 << 4
	//	_11111111 = _1111 + _11110000
	//	x16       = _11111111 << 8 + _11111111
	//	x32       = x16 << 16 + x16
	//	return      ((x32 << 32 + 1) << 96 + 1) << 94
	//
	// Operations: 253 squares 7 multiplies
	//
	// Generated by github.com/mmcloughlin/addchain v0.4.0.

	// Likewise this tramples over arg1, so stash another copy.
	t0, xx: Montgomery_Domain_Field_Element = ---, arg1^

	// Step 1: z = x^0x2
	fe_square(out1, arg1)

	// Step 2: z = x^0x3
	fe_mul(out1, &xx, out1)

	// Step 4: t0 = x^0xc
	//
	// Note: `_1100 = _11 << 2`, so the value being squared twice is
	// `z` (x^0x3), not `x`.
	fe_pow2k(&t0, out1, 2)

	// Step 5: z = x^0xf
	fe_mul(out1, out1, &t0)

	// Step 9: t0 = x^0xf0
	fe_pow2k(&t0, out1, 4)

	// Step 10: z = x^0xff
	fe_mul(out1, out1, &t0)

	// Step 18: t0 = x^0xff00
	fe_pow2k(&t0, out1, 8)

	// Step 19: z = x^0xffff
	fe_mul(out1, out1, &t0)

	// Step 35: t0 = x^0xffff0000
	fe_pow2k(&t0, out1, 16)

	// Step 36: z = x^0xffffffff
	fe_mul(out1, out1, &t0)

	// Step 68: z = x^0xffffffff00000000
	fe_pow2k(out1, out1, 32)

	// Step 69: z = x^0xffffffff00000001
	fe_mul(out1, &xx, out1)

	// Step 165: z = x^0xffffffff00000001000000000000000000000000
	fe_pow2k(out1, out1, 96)

	// Step 166: z = x^0xffffffff00000001000000000000000000000001
	fe_mul(out1, &xx, out1)

	// Step 260: z = x^0x3fffffffc0000000400000000000000000000000400000000000000000000000
	fe_pow2k(out1, out1, 94)

	// Ensure that our candidate is actually the square root.
	check, zero: Montgomery_Domain_Field_Element
	fe_square(&check, out1)

	is_valid := fe_equal(&check, &xx)
	// out1 = is_valid ? out1 : 0
	fe_cond_select(out1, &zero, out1, is_valid)

	fe_clear_vec([]^Montgomery_Domain_Field_Element{&t0, &xx, &check})

	return is_valid
}

// fe_zero sets out1 to the all-zero limb representation.
fe_zero :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element) {
	out1[0] = 0
	out1[1] = 0
	out1[2] = 0
	out1[3] = 0
}

// fe_set copies arg1 into out1.  All limbs are read before any are
// written, so out1 and arg1 may alias.
fe_set :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) {
	x1 := arg1[0]
	x2 := arg1[1]
	x3 := arg1[2]
	x4 := arg1[3]
	out1[0] = x1
	out1[1] = x2
	out1[2] = x3
	out1[3] = x4
}

// fe_cond_swap swaps out1 and out2 iff arg1 == 1, and leaves both
// unchanged iff arg1 == 0, without branching on arg1.
//
// arg1 MUST be 0 or 1, as the mask is derived by multiplication.
@(optimization_mode = "none")
fe_cond_swap :: #force_no_inline proc "contextless" (out1, out2: ^Montgomery_Domain_Field_Element, arg1: int) {
	// mask is all-ones when arg1 == 1, all-zeroes when arg1 == 0.
	mask := (u64(arg1) * 0xffffffffffffffff)
	x := (out1[0] ~ out2[0]) & mask
	x1, y1 := out1[0] ~ x, out2[0] ~ x
	x = (out1[1] ~ out2[1]) & mask
	x2, y2 := out1[1] ~ x, out2[1] ~ x
	x = (out1[2] ~ out2[2]) & mask
	x3, y3 := out1[2] ~ x, out2[2] ~ x
	x = (out1[3] ~ out2[3]) & mask
	x4, y4 := out1[3] ~ x, out2[3] ~ x
	out1[0], out2[0] = x1, y1
	out1[1], out2[1] = x2, y2
	out1[2], out2[2] = x3, y3
	out1[3], out2[3] = x4, y4
}

// fe_cond_select sets `out1 = arg2` iff arg3 == 1, and `out1 = arg1`
// iff arg3 == 0, without branching on arg3.  out1 may alias either
// input.
//
// arg3 MUST be 0 or 1, as the mask is derived by multiplication.
@(optimization_mode = "none")
fe_cond_select :: #force_no_inline proc "contextless" (
	out1, arg1, arg2: ^Montgomery_Domain_Field_Element,
	arg3: int,
) {
	// mask is all-ones when arg3 == 1, all-zeroes when arg3 == 0.
	mask := (u64(arg3) * 0xffffffffffffffff)
	x1 := ((mask & arg2[0]) | ((~mask) & arg1[0]))
	x2 := ((mask & arg2[1]) | ((~mask) & arg1[1]))
	x3 := ((mask & arg2[2]) | ((~mask) & arg1[2]))
	x4 := ((mask & arg2[3]) | ((~mask) & arg1[3]))
	out1[0] = x1
	out1[1] = x2
	out1[2] = x3
	out1[3] = x4
}

// fe_cond_negate sets `out1 = -arg1` iff ctrl == 1, and `out1 = arg1`
// iff ctrl == 0.  ctrl MUST be 0 or 1 (see `fe_cond_select`).
fe_cond_negate :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element, ctrl: int) {
	tmp1: Montgomery_Domain_Field_Element = ---
	fe_opp(&tmp1, arg1)
	fe_cond_select(out1, arg1, &tmp1, ctrl)

	fe_clear(&tmp1)
}
diff --git a/core/crypto/_fiat/field_p256r1/field64.odin b/core/crypto/_fiat/field_p256r1/field64.odin new file mode
100644 index 00000000000..940f2cd0756 --- /dev/null +++ b/core/crypto/_fiat/field_p256r1/field64.odin @@ -0,0 +1,501 @@ +// The BSD 1-Clause License (BSD-1-Clause) +// +// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file) +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design, +// Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +package field_p256r1 + +// The file provides arithmetic on the field Z/(2^256 - 2^224 + 2^192 + 2^96 - 1) +// using a 64-bit Montgomery form internal representation. It is derived +// primarily from the machine generated Golang output from the fiat-crypto +// project. +// +// While the base implementation is provably correct, this implementation +// makes no such claims as the port and optimizations were done by hand. +// +// WARNING: While big-endian is the common representation used for this +// curve, the fiat output uses least-significant-limb first. 
+ +import fiat "core:crypto/_fiat" +import "core:math/bits" + +// ELL is the saturated representation of the field order, least-significant +// limb first. +ELL :: [4]u64{0xffffffffffffffff, 0xffffffff, 0x0, 0xffffffff00000001} + +Montgomery_Domain_Field_Element :: distinct [4]u64 +Non_Montgomery_Domain_Field_Element :: distinct [4]u64 + +fe_mul :: proc "contextless" (out1, arg1, arg2: ^Montgomery_Domain_Field_Element) { + x1 := arg1[1] + x2 := arg1[2] + x3 := arg1[3] + x4 := arg1[0] + x6, x5 := bits.mul_u64(x4, arg2[3]) + x8, x7 := bits.mul_u64(x4, arg2[2]) + x10, x9 := bits.mul_u64(x4, arg2[1]) + x12, x11 := bits.mul_u64(x4, arg2[0]) + x13, x14 := bits.add_u64(x12, x9, u64(0x0)) + x15, x16 := bits.add_u64(x10, x7, u64(fiat.u1(x14))) + x17, x18 := bits.add_u64(x8, x5, u64(fiat.u1(x16))) + x19 := (u64(fiat.u1(x18)) + x6) + x21, x20 := bits.mul_u64(x11, 0xffffffff00000001) + x23, x22 := bits.mul_u64(x11, 0xffffffff) + x25, x24 := bits.mul_u64(x11, 0xffffffffffffffff) + x26, x27 := bits.add_u64(x25, x22, u64(0x0)) + x28 := (u64(fiat.u1(x27)) + x23) + _, x30 := bits.add_u64(x11, x24, u64(0x0)) + x31, x32 := bits.add_u64(x13, x26, u64(fiat.u1(x30))) + x33, x34 := bits.add_u64(x15, x28, u64(fiat.u1(x32))) + x35, x36 := bits.add_u64(x17, x20, u64(fiat.u1(x34))) + x37, x38 := bits.add_u64(x19, x21, u64(fiat.u1(x36))) + x40, x39 := bits.mul_u64(x1, arg2[3]) + x42, x41 := bits.mul_u64(x1, arg2[2]) + x44, x43 := bits.mul_u64(x1, arg2[1]) + x46, x45 := bits.mul_u64(x1, arg2[0]) + x47, x48 := bits.add_u64(x46, x43, u64(0x0)) + x49, x50 := bits.add_u64(x44, x41, u64(fiat.u1(x48))) + x51, x52 := bits.add_u64(x42, x39, u64(fiat.u1(x50))) + x53 := (u64(fiat.u1(x52)) + x40) + x54, x55 := bits.add_u64(x31, x45, u64(0x0)) + x56, x57 := bits.add_u64(x33, x47, u64(fiat.u1(x55))) + x58, x59 := bits.add_u64(x35, x49, u64(fiat.u1(x57))) + x60, x61 := bits.add_u64(x37, x51, u64(fiat.u1(x59))) + x62, x63 := bits.add_u64(u64(fiat.u1(x38)), x53, u64(fiat.u1(x61))) + x65, x64 := 
bits.mul_u64(x54, 0xffffffff00000001) + x67, x66 := bits.mul_u64(x54, 0xffffffff) + x69, x68 := bits.mul_u64(x54, 0xffffffffffffffff) + x70, x71 := bits.add_u64(x69, x66, u64(0x0)) + x72 := (u64(fiat.u1(x71)) + x67) + _, x74 := bits.add_u64(x54, x68, u64(0x0)) + x75, x76 := bits.add_u64(x56, x70, u64(fiat.u1(x74))) + x77, x78 := bits.add_u64(x58, x72, u64(fiat.u1(x76))) + x79, x80 := bits.add_u64(x60, x64, u64(fiat.u1(x78))) + x81, x82 := bits.add_u64(x62, x65, u64(fiat.u1(x80))) + x83 := (u64(fiat.u1(x82)) + u64(fiat.u1(x63))) + x85, x84 := bits.mul_u64(x2, arg2[3]) + x87, x86 := bits.mul_u64(x2, arg2[2]) + x89, x88 := bits.mul_u64(x2, arg2[1]) + x91, x90 := bits.mul_u64(x2, arg2[0]) + x92, x93 := bits.add_u64(x91, x88, u64(0x0)) + x94, x95 := bits.add_u64(x89, x86, u64(fiat.u1(x93))) + x96, x97 := bits.add_u64(x87, x84, u64(fiat.u1(x95))) + x98 := (u64(fiat.u1(x97)) + x85) + x99, x100 := bits.add_u64(x75, x90, u64(0x0)) + x101, x102 := bits.add_u64(x77, x92, u64(fiat.u1(x100))) + x103, x104 := bits.add_u64(x79, x94, u64(fiat.u1(x102))) + x105, x106 := bits.add_u64(x81, x96, u64(fiat.u1(x104))) + x107, x108 := bits.add_u64(x83, x98, u64(fiat.u1(x106))) + x110, x109 := bits.mul_u64(x99, 0xffffffff00000001) + x112, x111 := bits.mul_u64(x99, 0xffffffff) + x114, x113 := bits.mul_u64(x99, 0xffffffffffffffff) + x115, x116 := bits.add_u64(x114, x111, u64(0x0)) + x117 := (u64(fiat.u1(x116)) + x112) + _, x119 := bits.add_u64(x99, x113, u64(0x0)) + x120, x121 := bits.add_u64(x101, x115, u64(fiat.u1(x119))) + x122, x123 := bits.add_u64(x103, x117, u64(fiat.u1(x121))) + x124, x125 := bits.add_u64(x105, x109, u64(fiat.u1(x123))) + x126, x127 := bits.add_u64(x107, x110, u64(fiat.u1(x125))) + x128 := (u64(fiat.u1(x127)) + u64(fiat.u1(x108))) + x130, x129 := bits.mul_u64(x3, arg2[3]) + x132, x131 := bits.mul_u64(x3, arg2[2]) + x134, x133 := bits.mul_u64(x3, arg2[1]) + x136, x135 := bits.mul_u64(x3, arg2[0]) + x137, x138 := bits.add_u64(x136, x133, u64(0x0)) + x139, x140 := 
bits.add_u64(x134, x131, u64(fiat.u1(x138))) + x141, x142 := bits.add_u64(x132, x129, u64(fiat.u1(x140))) + x143 := (u64(fiat.u1(x142)) + x130) + x144, x145 := bits.add_u64(x120, x135, u64(0x0)) + x146, x147 := bits.add_u64(x122, x137, u64(fiat.u1(x145))) + x148, x149 := bits.add_u64(x124, x139, u64(fiat.u1(x147))) + x150, x151 := bits.add_u64(x126, x141, u64(fiat.u1(x149))) + x152, x153 := bits.add_u64(x128, x143, u64(fiat.u1(x151))) + x155, x154 := bits.mul_u64(x144, 0xffffffff00000001) + x157, x156 := bits.mul_u64(x144, 0xffffffff) + x159, x158 := bits.mul_u64(x144, 0xffffffffffffffff) + x160, x161 := bits.add_u64(x159, x156, u64(0x0)) + x162 := (u64(fiat.u1(x161)) + x157) + _, x164 := bits.add_u64(x144, x158, u64(0x0)) + x165, x166 := bits.add_u64(x146, x160, u64(fiat.u1(x164))) + x167, x168 := bits.add_u64(x148, x162, u64(fiat.u1(x166))) + x169, x170 := bits.add_u64(x150, x154, u64(fiat.u1(x168))) + x171, x172 := bits.add_u64(x152, x155, u64(fiat.u1(x170))) + x173 := (u64(fiat.u1(x172)) + u64(fiat.u1(x153))) + x174, x175 := bits.sub_u64(x165, 0xffffffffffffffff, u64(0x0)) + x176, x177 := bits.sub_u64(x167, 0xffffffff, u64(fiat.u1(x175))) + x178, x179 := bits.sub_u64(x169, u64(0x0), u64(fiat.u1(x177))) + x180, x181 := bits.sub_u64(x171, 0xffffffff00000001, u64(fiat.u1(x179))) + _, x183 := bits.sub_u64(x173, u64(0x0), u64(fiat.u1(x181))) + x184 := fiat.cmovznz_u64(fiat.u1(x183), x174, x165) + x185 := fiat.cmovznz_u64(fiat.u1(x183), x176, x167) + x186 := fiat.cmovznz_u64(fiat.u1(x183), x178, x169) + x187 := fiat.cmovznz_u64(fiat.u1(x183), x180, x171) + out1[0] = x184 + out1[1] = x185 + out1[2] = x186 + out1[3] = x187 +} + +fe_square :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) { + x1 := arg1[1] + x2 := arg1[2] + x3 := arg1[3] + x4 := arg1[0] + x6, x5 := bits.mul_u64(x4, arg1[3]) + x8, x7 := bits.mul_u64(x4, arg1[2]) + x10, x9 := bits.mul_u64(x4, arg1[1]) + x12, x11 := bits.mul_u64(x4, arg1[0]) + x13, x14 := bits.add_u64(x12, x9, u64(0x0)) 
+ x15, x16 := bits.add_u64(x10, x7, u64(fiat.u1(x14))) + x17, x18 := bits.add_u64(x8, x5, u64(fiat.u1(x16))) + x19 := (u64(fiat.u1(x18)) + x6) + x21, x20 := bits.mul_u64(x11, 0xffffffff00000001) + x23, x22 := bits.mul_u64(x11, 0xffffffff) + x25, x24 := bits.mul_u64(x11, 0xffffffffffffffff) + x26, x27 := bits.add_u64(x25, x22, u64(0x0)) + x28 := (u64(fiat.u1(x27)) + x23) + _, x30 := bits.add_u64(x11, x24, u64(0x0)) + x31, x32 := bits.add_u64(x13, x26, u64(fiat.u1(x30))) + x33, x34 := bits.add_u64(x15, x28, u64(fiat.u1(x32))) + x35, x36 := bits.add_u64(x17, x20, u64(fiat.u1(x34))) + x37, x38 := bits.add_u64(x19, x21, u64(fiat.u1(x36))) + x40, x39 := bits.mul_u64(x1, arg1[3]) + x42, x41 := bits.mul_u64(x1, arg1[2]) + x44, x43 := bits.mul_u64(x1, arg1[1]) + x46, x45 := bits.mul_u64(x1, arg1[0]) + x47, x48 := bits.add_u64(x46, x43, u64(0x0)) + x49, x50 := bits.add_u64(x44, x41, u64(fiat.u1(x48))) + x51, x52 := bits.add_u64(x42, x39, u64(fiat.u1(x50))) + x53 := (u64(fiat.u1(x52)) + x40) + x54, x55 := bits.add_u64(x31, x45, u64(0x0)) + x56, x57 := bits.add_u64(x33, x47, u64(fiat.u1(x55))) + x58, x59 := bits.add_u64(x35, x49, u64(fiat.u1(x57))) + x60, x61 := bits.add_u64(x37, x51, u64(fiat.u1(x59))) + x62, x63 := bits.add_u64(u64(fiat.u1(x38)), x53, u64(fiat.u1(x61))) + x65, x64 := bits.mul_u64(x54, 0xffffffff00000001) + x67, x66 := bits.mul_u64(x54, 0xffffffff) + x69, x68 := bits.mul_u64(x54, 0xffffffffffffffff) + x70, x71 := bits.add_u64(x69, x66, u64(0x0)) + x72 := (u64(fiat.u1(x71)) + x67) + _, x74 := bits.add_u64(x54, x68, u64(0x0)) + x75, x76 := bits.add_u64(x56, x70, u64(fiat.u1(x74))) + x77, x78 := bits.add_u64(x58, x72, u64(fiat.u1(x76))) + x79, x80 := bits.add_u64(x60, x64, u64(fiat.u1(x78))) + x81, x82 := bits.add_u64(x62, x65, u64(fiat.u1(x80))) + x83 := (u64(fiat.u1(x82)) + u64(fiat.u1(x63))) + x85, x84 := bits.mul_u64(x2, arg1[3]) + x87, x86 := bits.mul_u64(x2, arg1[2]) + x89, x88 := bits.mul_u64(x2, arg1[1]) + x91, x90 := bits.mul_u64(x2, arg1[0]) + x92, x93 
:= bits.add_u64(x91, x88, u64(0x0)) + x94, x95 := bits.add_u64(x89, x86, u64(fiat.u1(x93))) + x96, x97 := bits.add_u64(x87, x84, u64(fiat.u1(x95))) + x98 := (u64(fiat.u1(x97)) + x85) + x99, x100 := bits.add_u64(x75, x90, u64(0x0)) + x101, x102 := bits.add_u64(x77, x92, u64(fiat.u1(x100))) + x103, x104 := bits.add_u64(x79, x94, u64(fiat.u1(x102))) + x105, x106 := bits.add_u64(x81, x96, u64(fiat.u1(x104))) + x107, x108 := bits.add_u64(x83, x98, u64(fiat.u1(x106))) + x110, x109 := bits.mul_u64(x99, 0xffffffff00000001) + x112, x111 := bits.mul_u64(x99, 0xffffffff) + x114, x113 := bits.mul_u64(x99, 0xffffffffffffffff) + x115, x116 := bits.add_u64(x114, x111, u64(0x0)) + x117 := (u64(fiat.u1(x116)) + x112) + _, x119 := bits.add_u64(x99, x113, u64(0x0)) + x120, x121 := bits.add_u64(x101, x115, u64(fiat.u1(x119))) + x122, x123 := bits.add_u64(x103, x117, u64(fiat.u1(x121))) + x124, x125 := bits.add_u64(x105, x109, u64(fiat.u1(x123))) + x126, x127 := bits.add_u64(x107, x110, u64(fiat.u1(x125))) + x128 := (u64(fiat.u1(x127)) + u64(fiat.u1(x108))) + x130, x129 := bits.mul_u64(x3, arg1[3]) + x132, x131 := bits.mul_u64(x3, arg1[2]) + x134, x133 := bits.mul_u64(x3, arg1[1]) + x136, x135 := bits.mul_u64(x3, arg1[0]) + x137, x138 := bits.add_u64(x136, x133, u64(0x0)) + x139, x140 := bits.add_u64(x134, x131, u64(fiat.u1(x138))) + x141, x142 := bits.add_u64(x132, x129, u64(fiat.u1(x140))) + x143 := (u64(fiat.u1(x142)) + x130) + x144, x145 := bits.add_u64(x120, x135, u64(0x0)) + x146, x147 := bits.add_u64(x122, x137, u64(fiat.u1(x145))) + x148, x149 := bits.add_u64(x124, x139, u64(fiat.u1(x147))) + x150, x151 := bits.add_u64(x126, x141, u64(fiat.u1(x149))) + x152, x153 := bits.add_u64(x128, x143, u64(fiat.u1(x151))) + x155, x154 := bits.mul_u64(x144, 0xffffffff00000001) + x157, x156 := bits.mul_u64(x144, 0xffffffff) + x159, x158 := bits.mul_u64(x144, 0xffffffffffffffff) + x160, x161 := bits.add_u64(x159, x156, u64(0x0)) + x162 := (u64(fiat.u1(x161)) + x157) + _, x164 := 
bits.add_u64(x144, x158, u64(0x0)) + x165, x166 := bits.add_u64(x146, x160, u64(fiat.u1(x164))) + x167, x168 := bits.add_u64(x148, x162, u64(fiat.u1(x166))) + x169, x170 := bits.add_u64(x150, x154, u64(fiat.u1(x168))) + x171, x172 := bits.add_u64(x152, x155, u64(fiat.u1(x170))) + x173 := (u64(fiat.u1(x172)) + u64(fiat.u1(x153))) + x174, x175 := bits.sub_u64(x165, 0xffffffffffffffff, u64(0x0)) + x176, x177 := bits.sub_u64(x167, 0xffffffff, u64(fiat.u1(x175))) + x178, x179 := bits.sub_u64(x169, u64(0x0), u64(fiat.u1(x177))) + x180, x181 := bits.sub_u64(x171, 0xffffffff00000001, u64(fiat.u1(x179))) + _, x183 := bits.sub_u64(x173, u64(0x0), u64(fiat.u1(x181))) + x184 := fiat.cmovznz_u64(fiat.u1(x183), x174, x165) + x185 := fiat.cmovznz_u64(fiat.u1(x183), x176, x167) + x186 := fiat.cmovznz_u64(fiat.u1(x183), x178, x169) + x187 := fiat.cmovznz_u64(fiat.u1(x183), x180, x171) + out1[0] = x184 + out1[1] = x185 + out1[2] = x186 + out1[3] = x187 +} + +fe_add :: proc "contextless" (out1, arg1, arg2: ^Montgomery_Domain_Field_Element) { + x1, x2 := bits.add_u64(arg1[0], arg2[0], u64(0x0)) + x3, x4 := bits.add_u64(arg1[1], arg2[1], u64(fiat.u1(x2))) + x5, x6 := bits.add_u64(arg1[2], arg2[2], u64(fiat.u1(x4))) + x7, x8 := bits.add_u64(arg1[3], arg2[3], u64(fiat.u1(x6))) + x9, x10 := bits.sub_u64(x1, 0xffffffffffffffff, u64(0x0)) + x11, x12 := bits.sub_u64(x3, 0xffffffff, u64(fiat.u1(x10))) + x13, x14 := bits.sub_u64(x5, u64(0x0), u64(fiat.u1(x12))) + x15, x16 := bits.sub_u64(x7, 0xffffffff00000001, u64(fiat.u1(x14))) + _, x18 := bits.sub_u64(u64(fiat.u1(x8)), u64(0x0), u64(fiat.u1(x16))) + x19 := fiat.cmovznz_u64(fiat.u1(x18), x9, x1) + x20 := fiat.cmovznz_u64(fiat.u1(x18), x11, x3) + x21 := fiat.cmovznz_u64(fiat.u1(x18), x13, x5) + x22 := fiat.cmovznz_u64(fiat.u1(x18), x15, x7) + out1[0] = x19 + out1[1] = x20 + out1[2] = x21 + out1[3] = x22 +} + +fe_sub :: proc "contextless" (out1, arg1, arg2: ^Montgomery_Domain_Field_Element) { + x1, x2 := bits.sub_u64(arg1[0], arg2[0], 
u64(0x0)) + x3, x4 := bits.sub_u64(arg1[1], arg2[1], u64(fiat.u1(x2))) + x5, x6 := bits.sub_u64(arg1[2], arg2[2], u64(fiat.u1(x4))) + x7, x8 := bits.sub_u64(arg1[3], arg2[3], u64(fiat.u1(x6))) + x9 := fiat.cmovznz_u64(fiat.u1(x8), u64(0x0), 0xffffffffffffffff) + x10, x11 := bits.add_u64(x1, x9, u64(0x0)) + x12, x13 := bits.add_u64(x3, (x9 & 0xffffffff), u64(fiat.u1(x11))) + x14, x15 := bits.add_u64(x5, u64(0x0), u64(fiat.u1(x13))) + x16, _ := bits.add_u64(x7, (x9 & 0xffffffff00000001), u64(fiat.u1(x15))) + out1[0] = x10 + out1[1] = x12 + out1[2] = x14 + out1[3] = x16 +} + +fe_opp :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) { + x1, x2 := bits.sub_u64(u64(0x0), arg1[0], u64(0x0)) + x3, x4 := bits.sub_u64(u64(0x0), arg1[1], u64(fiat.u1(x2))) + x5, x6 := bits.sub_u64(u64(0x0), arg1[2], u64(fiat.u1(x4))) + x7, x8 := bits.sub_u64(u64(0x0), arg1[3], u64(fiat.u1(x6))) + x9 := fiat.cmovznz_u64(fiat.u1(x8), u64(0x0), 0xffffffffffffffff) + x10, x11 := bits.add_u64(x1, x9, u64(0x0)) + x12, x13 := bits.add_u64(x3, (x9 & 0xffffffff), u64(fiat.u1(x11))) + x14, x15 := bits.add_u64(x5, u64(0x0), u64(fiat.u1(x13))) + x16, _ := bits.add_u64(x7, (x9 & 0xffffffff00000001), u64(fiat.u1(x15))) + out1[0] = x10 + out1[1] = x12 + out1[2] = x14 + out1[3] = x16 +} + +fe_one :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element) { + out1[0] = 0x1 + out1[1] = 0xffffffff00000000 + out1[2] = 0xffffffffffffffff + out1[3] = 0xfffffffe +} + +fe_non_zero :: proc "contextless" (arg1: ^Montgomery_Domain_Field_Element) -> u64 { + return arg1[0] | (arg1[1] | (arg1[2] | arg1[3])) +} + +@(optimization_mode = "none") +fe_cond_assign :: #force_no_inline proc "contextless" ( + out1, arg1: ^Montgomery_Domain_Field_Element, + arg2: int, +) { + x1 := fiat.cmovznz_u64(fiat.u1(arg2), out1[0], arg1[0]) + x2 := fiat.cmovznz_u64(fiat.u1(arg2), out1[1], arg1[1]) + x3 := fiat.cmovznz_u64(fiat.u1(arg2), out1[2], arg1[2]) + x4 := fiat.cmovznz_u64(fiat.u1(arg2), out1[3], arg1[3]) + out1[0] 
= x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 +} + +fe_from_montgomery :: proc "contextless" ( + out1: ^Non_Montgomery_Domain_Field_Element, + arg1: ^Montgomery_Domain_Field_Element, +) { + x1 := arg1[0] + x3, x2 := bits.mul_u64(x1, 0xffffffff00000001) + x5, x4 := bits.mul_u64(x1, 0xffffffff) + x7, x6 := bits.mul_u64(x1, 0xffffffffffffffff) + x8, x9 := bits.add_u64(x7, x4, u64(0x0)) + _, x11 := bits.add_u64(x1, x6, u64(0x0)) + x12, x13 := bits.add_u64(u64(0x0), x8, u64(fiat.u1(x11))) + x14, x15 := bits.add_u64(x12, arg1[1], u64(0x0)) + x17, x16 := bits.mul_u64(x14, 0xffffffff00000001) + x19, x18 := bits.mul_u64(x14, 0xffffffff) + x21, x20 := bits.mul_u64(x14, 0xffffffffffffffff) + x22, x23 := bits.add_u64(x21, x18, u64(0x0)) + _, x25 := bits.add_u64(x14, x20, u64(0x0)) + x26, x27 := bits.add_u64((u64(fiat.u1(x15)) + (u64(fiat.u1(x13)) + (u64(fiat.u1(x9)) + x5))), x22, u64(fiat.u1(x25))) + x28, x29 := bits.add_u64(x2, (u64(fiat.u1(x23)) + x19), u64(fiat.u1(x27))) + x30, x31 := bits.add_u64(x3, x16, u64(fiat.u1(x29))) + x32, x33 := bits.add_u64(x26, arg1[2], u64(0x0)) + x34, x35 := bits.add_u64(x28, u64(0x0), u64(fiat.u1(x33))) + x36, x37 := bits.add_u64(x30, u64(0x0), u64(fiat.u1(x35))) + x39, x38 := bits.mul_u64(x32, 0xffffffff00000001) + x41, x40 := bits.mul_u64(x32, 0xffffffff) + x43, x42 := bits.mul_u64(x32, 0xffffffffffffffff) + x44, x45 := bits.add_u64(x43, x40, u64(0x0)) + _, x47 := bits.add_u64(x32, x42, u64(0x0)) + x48, x49 := bits.add_u64(x34, x44, u64(fiat.u1(x47))) + x50, x51 := bits.add_u64(x36, (u64(fiat.u1(x45)) + x41), u64(fiat.u1(x49))) + x52, x53 := bits.add_u64((u64(fiat.u1(x37)) + (u64(fiat.u1(x31)) + x17)), x38, u64(fiat.u1(x51))) + x54, x55 := bits.add_u64(x48, arg1[3], u64(0x0)) + x56, x57 := bits.add_u64(x50, u64(0x0), u64(fiat.u1(x55))) + x58, x59 := bits.add_u64(x52, u64(0x0), u64(fiat.u1(x57))) + x61, x60 := bits.mul_u64(x54, 0xffffffff00000001) + x63, x62 := bits.mul_u64(x54, 0xffffffff) + x65, x64 := bits.mul_u64(x54, 
0xffffffffffffffff) + x66, x67 := bits.add_u64(x65, x62, u64(0x0)) + _, x69 := bits.add_u64(x54, x64, u64(0x0)) + x70, x71 := bits.add_u64(x56, x66, u64(fiat.u1(x69))) + x72, x73 := bits.add_u64(x58, (u64(fiat.u1(x67)) + x63), u64(fiat.u1(x71))) + x74, x75 := bits.add_u64((u64(fiat.u1(x59)) + (u64(fiat.u1(x53)) + x39)), x60, u64(fiat.u1(x73))) + x76 := (u64(fiat.u1(x75)) + x61) + x77, x78 := bits.sub_u64(x70, 0xffffffffffffffff, u64(0x0)) + x79, x80 := bits.sub_u64(x72, 0xffffffff, u64(fiat.u1(x78))) + x81, x82 := bits.sub_u64(x74, u64(0x0), u64(fiat.u1(x80))) + x83, x84 := bits.sub_u64(x76, 0xffffffff00000001, u64(fiat.u1(x82))) + _, x86 := bits.sub_u64(u64(0x0), u64(0x0), u64(fiat.u1(x84))) + x87 := fiat.cmovznz_u64(fiat.u1(x86), x77, x70) + x88 := fiat.cmovznz_u64(fiat.u1(x86), x79, x72) + x89 := fiat.cmovznz_u64(fiat.u1(x86), x81, x74) + x90 := fiat.cmovznz_u64(fiat.u1(x86), x83, x76) + out1[0] = x87 + out1[1] = x88 + out1[2] = x89 + out1[3] = x90 +} + +fe_to_montgomery :: proc "contextless" ( + out1: ^Montgomery_Domain_Field_Element, + arg1: ^Non_Montgomery_Domain_Field_Element, +) { + x1 := arg1[1] + x2 := arg1[2] + x3 := arg1[3] + x4 := arg1[0] + x6, x5 := bits.mul_u64(x4, 0x4fffffffd) + x8, x7 := bits.mul_u64(x4, 0xfffffffffffffffe) + x10, x9 := bits.mul_u64(x4, 0xfffffffbffffffff) + x12, x11 := bits.mul_u64(x4, 0x3) + x13, x14 := bits.add_u64(x12, x9, u64(0x0)) + x15, x16 := bits.add_u64(x10, x7, u64(fiat.u1(x14))) + x17, x18 := bits.add_u64(x8, x5, u64(fiat.u1(x16))) + x20, x19 := bits.mul_u64(x11, 0xffffffff00000001) + x22, x21 := bits.mul_u64(x11, 0xffffffff) + x24, x23 := bits.mul_u64(x11, 0xffffffffffffffff) + x25, x26 := bits.add_u64(x24, x21, u64(0x0)) + _, x28 := bits.add_u64(x11, x23, u64(0x0)) + x29, x30 := bits.add_u64(x13, x25, u64(fiat.u1(x28))) + x31, x32 := bits.add_u64(x15, (u64(fiat.u1(x26)) + x22), u64(fiat.u1(x30))) + x33, x34 := bits.add_u64(x17, x19, u64(fiat.u1(x32))) + x35, x36 := bits.add_u64((u64(fiat.u1(x18)) + x6), x20, 
u64(fiat.u1(x34))) + x38, x37 := bits.mul_u64(x1, 0x4fffffffd) + x40, x39 := bits.mul_u64(x1, 0xfffffffffffffffe) + x42, x41 := bits.mul_u64(x1, 0xfffffffbffffffff) + x44, x43 := bits.mul_u64(x1, 0x3) + x45, x46 := bits.add_u64(x44, x41, u64(0x0)) + x47, x48 := bits.add_u64(x42, x39, u64(fiat.u1(x46))) + x49, x50 := bits.add_u64(x40, x37, u64(fiat.u1(x48))) + x51, x52 := bits.add_u64(x29, x43, u64(0x0)) + x53, x54 := bits.add_u64(x31, x45, u64(fiat.u1(x52))) + x55, x56 := bits.add_u64(x33, x47, u64(fiat.u1(x54))) + x57, x58 := bits.add_u64(x35, x49, u64(fiat.u1(x56))) + x60, x59 := bits.mul_u64(x51, 0xffffffff00000001) + x62, x61 := bits.mul_u64(x51, 0xffffffff) + x64, x63 := bits.mul_u64(x51, 0xffffffffffffffff) + x65, x66 := bits.add_u64(x64, x61, u64(0x0)) + _, x68 := bits.add_u64(x51, x63, u64(0x0)) + x69, x70 := bits.add_u64(x53, x65, u64(fiat.u1(x68))) + x71, x72 := bits.add_u64(x55, (u64(fiat.u1(x66)) + x62), u64(fiat.u1(x70))) + x73, x74 := bits.add_u64(x57, x59, u64(fiat.u1(x72))) + x75, x76 := bits.add_u64(((u64(fiat.u1(x58)) + u64(fiat.u1(x36))) + (u64(fiat.u1(x50)) + x38)), x60, u64(fiat.u1(x74))) + x78, x77 := bits.mul_u64(x2, 0x4fffffffd) + x80, x79 := bits.mul_u64(x2, 0xfffffffffffffffe) + x82, x81 := bits.mul_u64(x2, 0xfffffffbffffffff) + x84, x83 := bits.mul_u64(x2, 0x3) + x85, x86 := bits.add_u64(x84, x81, u64(0x0)) + x87, x88 := bits.add_u64(x82, x79, u64(fiat.u1(x86))) + x89, x90 := bits.add_u64(x80, x77, u64(fiat.u1(x88))) + x91, x92 := bits.add_u64(x69, x83, u64(0x0)) + x93, x94 := bits.add_u64(x71, x85, u64(fiat.u1(x92))) + x95, x96 := bits.add_u64(x73, x87, u64(fiat.u1(x94))) + x97, x98 := bits.add_u64(x75, x89, u64(fiat.u1(x96))) + x100, x99 := bits.mul_u64(x91, 0xffffffff00000001) + x102, x101 := bits.mul_u64(x91, 0xffffffff) + x104, x103 := bits.mul_u64(x91, 0xffffffffffffffff) + x105, x106 := bits.add_u64(x104, x101, u64(0x0)) + _, x108 := bits.add_u64(x91, x103, u64(0x0)) + x109, x110 := bits.add_u64(x93, x105, u64(fiat.u1(x108))) + 
x111, x112 := bits.add_u64(x95, (u64(fiat.u1(x106)) + x102), u64(fiat.u1(x110))) + x113, x114 := bits.add_u64(x97, x99, u64(fiat.u1(x112))) + x115, x116 := bits.add_u64(((u64(fiat.u1(x98)) + u64(fiat.u1(x76))) + (u64(fiat.u1(x90)) + x78)), x100, u64(fiat.u1(x114))) + x118, x117 := bits.mul_u64(x3, 0x4fffffffd) + x120, x119 := bits.mul_u64(x3, 0xfffffffffffffffe) + x122, x121 := bits.mul_u64(x3, 0xfffffffbffffffff) + x124, x123 := bits.mul_u64(x3, 0x3) + x125, x126 := bits.add_u64(x124, x121, u64(0x0)) + x127, x128 := bits.add_u64(x122, x119, u64(fiat.u1(x126))) + x129, x130 := bits.add_u64(x120, x117, u64(fiat.u1(x128))) + x131, x132 := bits.add_u64(x109, x123, u64(0x0)) + x133, x134 := bits.add_u64(x111, x125, u64(fiat.u1(x132))) + x135, x136 := bits.add_u64(x113, x127, u64(fiat.u1(x134))) + x137, x138 := bits.add_u64(x115, x129, u64(fiat.u1(x136))) + x140, x139 := bits.mul_u64(x131, 0xffffffff00000001) + x142, x141 := bits.mul_u64(x131, 0xffffffff) + x144, x143 := bits.mul_u64(x131, 0xffffffffffffffff) + x145, x146 := bits.add_u64(x144, x141, u64(0x0)) + _, x148 := bits.add_u64(x131, x143, u64(0x0)) + x149, x150 := bits.add_u64(x133, x145, u64(fiat.u1(x148))) + x151, x152 := bits.add_u64(x135, (u64(fiat.u1(x146)) + x142), u64(fiat.u1(x150))) + x153, x154 := bits.add_u64(x137, x139, u64(fiat.u1(x152))) + x155, x156 := bits.add_u64(((u64(fiat.u1(x138)) + u64(fiat.u1(x116))) + (u64(fiat.u1(x130)) + x118)), x140, u64(fiat.u1(x154))) + x157, x158 := bits.sub_u64(x149, 0xffffffffffffffff, u64(0x0)) + x159, x160 := bits.sub_u64(x151, 0xffffffff, u64(fiat.u1(x158))) + x161, x162 := bits.sub_u64(x153, u64(0x0), u64(fiat.u1(x160))) + x163, x164 := bits.sub_u64(x155, 0xffffffff00000001, u64(fiat.u1(x162))) + _, x166 := bits.sub_u64(u64(fiat.u1(x156)), u64(0x0), u64(fiat.u1(x164))) + x167 := fiat.cmovznz_u64(fiat.u1(x166), x157, x149) + x168 := fiat.cmovznz_u64(fiat.u1(x166), x159, x151) + x169 := fiat.cmovznz_u64(fiat.u1(x166), x161, x153) + x170 := 
fiat.cmovznz_u64(fiat.u1(x166), x163, x155)
+	out1[0] = x167
+	out1[1] = x168
+	out1[2] = x169
+	out1[3] = x170
+}
diff --git a/core/crypto/_fiat/field_scalarp256r1/field.odin b/core/crypto/_fiat/field_scalarp256r1/field.odin
new file mode 100644
index 00000000000..4ee9b173070
--- /dev/null
+++ b/core/crypto/_fiat/field_scalarp256r1/field.odin
@@ -0,0 +1,210 @@
+package field_scalarp256r1
+
+import "core:encoding/endian"
+import "core:math/bits"
+import "core:mem"
+
+@(private, rodata)
+TWO_192 := Montgomery_Domain_Field_Element{
+	2482910415990817935,
+	2879494685571067143,
+	8732918506673730078,
+	85565669603516024,
+}
+@(private, rodata)
+TWO_384 := Montgomery_Domain_Field_Element{
+	4249636649191695722,
+	15824622900665792993,
+	2045563599676998936,
+	8979594627705675929,
+} // 2^384 % p (From sage)
+// fe_clear overwrites arg1 with zeroes using mem.zero_explicit.
+fe_clear :: proc "contextless" (arg1: ^Montgomery_Domain_Field_Element) {
+	mem.zero_explicit(arg1, size_of(Montgomery_Domain_Field_Element))
+}
+// fe_clear_vec calls fe_clear on every element pointer in arg1.
+fe_clear_vec :: proc "contextless" (
+	arg1: []^Montgomery_Domain_Field_Element,
+) {
+	for fe in arg1 {
+		fe_clear(fe)
+	}
+}
+// fe_from_bytes sets out1 to the big-endian integer in arg1 (at most 64 bytes) reduced modulo the group order. It returns false when arg1 was already canonical (shorter than 32 bytes, or 32 bytes encoding a value below ELL) and true when a reduction was required. NOTE(review): this polarity differs from field_p256r1.fe_from_bytes; confirm callers rely on it.
+fe_from_bytes :: proc "contextless" (
+	out1: ^Montgomery_Domain_Field_Element,
+	arg1: []byte,
+) -> bool {
+	ensure_contextless(len(arg1) <= 64, "p256r1: invalid scalar input buffer") // the input must fit the 64-byte staging buffer below
+
+	is_canonical := false
+	s_len := len(arg1)
+	switch {
+	case s_len < 32:
+		// No way this can be greater than the order.
+		fe_unchecked_set(out1, arg1)
+		is_canonical = true
+	case s_len == 32:
+		// It is quite likely that a reduction mod p is required,
+		// as the order of the curve is sufficiently smaller than
+		// 2^256-1, so just check if we actually needed to reduce
+		// and do the reduction anyway, so that things that require
+		// canonical scalars can reject non-canonical encodings.
+		is_canonical = fe_is_canonical(arg1)
+		fallthrough
+	case:
+		// Use Frank Denis' trick, as documented by Filippo Valsorda
+		// at https://words.filippo.io/dispatches/wide-reduction/
+		//
+		// "I represent the value as a+b*2^192+c*2^384"
+		//
+		// Note: Omitting the `c` computation is fine as, the only
+		// realistic sizes of the raw scalar are going to be 256 or
+		// 512-bits.
+
+		// Zero extend to 512-bits.
+		src_512: [64]byte
+		copy(src_512[64-s_len:], arg1)
+		defer mem.zero_explicit(&src_512, size_of(src_512))
+
+		fe_unchecked_set(out1, src_512[40:]) // a
+		b: Montgomery_Domain_Field_Element
+		fe_unchecked_set(&b, src_512[16:40]) // b
+
+		fe_mul(&b, &b, &TWO_192)
+		fe_add(out1, out1, &b)
+		if s_len >= 48 {
+			c: Montgomery_Domain_Field_Element
+			fe_unchecked_set(&c, src_512[:16]) // c
+			fe_mul(&c, &c, &TWO_384)
+			fe_add(out1, out1, &c)
+
+			fe_clear(&c)
+		}
+
+		fe_clear(&b)
+	}
+
+	return !is_canonical // true iff the input was not already canonical
+}
+// fe_is_canonical reports whether the 32-byte big-endian value in arg1 is at most ELL - 1, using a borrow chain instead of an early-exit comparison.
+@(private)
+fe_is_canonical :: proc "contextless" (arg1: []byte) -> bool {
+	_, borrow := bits.sub_u64(ELL[0] - 1, endian.unchecked_get_u64be(arg1[24:]), 0)
+	_, borrow = bits.sub_u64(ELL[1], endian.unchecked_get_u64be(arg1[16:]), borrow)
+	_, borrow = bits.sub_u64(ELL[2], endian.unchecked_get_u64be(arg1[8:]), borrow)
+	_, borrow = bits.sub_u64(ELL[3], endian.unchecked_get_u64be(arg1[0:]), borrow)
+	return borrow == 0
+}
+// fe_unchecked_set left-pads arg1 (at most 32 bytes, big-endian) with zeroes to 32 bytes and stores it in out1 in Montgomery form; no range check is done here.
+@(private)
+fe_unchecked_set :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element, arg1: []byte) {
+	arg1_256: [32]byte
+	defer mem.zero_explicit(&arg1_256, size_of(arg1_256))
+	copy(arg1_256[32-len(arg1):], arg1)
+
+	tmp := Non_Montgomery_Domain_Field_Element {
+		endian.unchecked_get_u64be(arg1_256[24:]),
+		endian.unchecked_get_u64be(arg1_256[16:]),
+		endian.unchecked_get_u64be(arg1_256[8:]),
+		endian.unchecked_get_u64be(arg1_256[0:]),
+	}
+	defer mem.zero_explicit(&tmp, size_of(tmp))
+
+	fe_to_montgomery(out1, &tmp)
+}
+// fe_to_bytes converts arg1 out of Montgomery form and writes it to out1 as 32 big-endian bytes; out1 must be exactly 32 bytes long.
+fe_to_bytes :: proc "contextless" (out1: []byte, arg1: ^Montgomery_Domain_Field_Element) {
+	ensure_contextless(len(out1) == 32, "p256r1: invalid scalar output buffer")
+
+	tmp: Non_Montgomery_Domain_Field_Element
+	fe_from_montgomery(&tmp, arg1)
+
+	// Note: Likewise, output in big-endian.
+	endian.unchecked_put_u64be(out1[24:], tmp[0])
+	endian.unchecked_put_u64be(out1[16:], tmp[1])
+	endian.unchecked_put_u64be(out1[8:], tmp[2])
+	endian.unchecked_put_u64be(out1[0:], tmp[3])
+
+	mem.zero_explicit(&tmp, size_of(tmp))
+}
+// fe_equal returns 1 if arg1 and arg2 are equal and 0 otherwise, without branching on the operands.
+fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) -> int {
+	tmp: Montgomery_Domain_Field_Element
+	fe_sub(&tmp, arg1, arg2)
+
+	// This will only underflow iff arg1 == arg2, and we return the borrow,
+	// which will be 1.
+	_, borrow := bits.sub_u64(fe_non_zero(&tmp), 1, 0)
+
+	fe_clear(&tmp)
+
+	return int(borrow)
+}
+// fe_is_odd returns the least significant bit of arg1 after conversion out of Montgomery form.
+fe_is_odd :: proc "contextless" (arg1: ^Montgomery_Domain_Field_Element) -> int {
+	tmp: Non_Montgomery_Domain_Field_Element
+	defer mem.zero_explicit(&tmp, size_of(tmp))
+
+	fe_from_montgomery(&tmp, arg1)
+	return int(tmp[0] & 1)
+}
+// fe_zero sets every limb of out1 to 0.
+fe_zero :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element) {
+	out1[0] = 0
+	out1[1] = 0
+	out1[2] = 0
+	out1[3] = 0
+}
+// fe_set copies the four limbs of arg1 into out1.
+fe_set :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) {
+	x1 := arg1[0]
+	x2 := arg1[1]
+	x3 := arg1[2]
+	x4 := arg1[3]
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+}
+// fe_cond_swap exchanges out1 and out2 when arg1 is 1 and leaves both unchanged when arg1 is 0, using an XOR mask rather than a branch.
+@(optimization_mode = "none")
+fe_cond_swap :: #force_no_inline proc "contextless" (out1, out2: ^Montgomery_Domain_Field_Element, arg1: int) {
+	mask := (u64(arg1) * 0xffffffffffffffff)
+	x := (out1[0] ~ out2[0]) & mask
+	x1, y1 := out1[0] ~ x, out2[0] ~ x
+	x = (out1[1] ~ out2[1]) & mask
+	x2, y2 := out1[1] ~ x, out2[1] ~ x
+	x = (out1[2] ~ out2[2]) & mask
+	x3, y3 := out1[2] ~ x, out2[2] ~ x
+	x = (out1[3] ~ out2[3]) & mask
+	x4, y4 := out1[3] ~ x, out2[3] ~ x
+	out1[0], out2[0] = x1, y1
+	out1[1], out2[1] = x2, y2
+	out1[2], out2[2] = x3, y3
+	out1[3], out2[3] = x4, y4
+}
+// fe_cond_select sets out1 to arg2 when arg3 is 1 and to arg1 when arg3 is 0, using a mask rather than a branch.
+@(optimization_mode = "none")
+fe_cond_select :: #force_no_inline proc "contextless" (
+	out1, arg1, arg2: ^Montgomery_Domain_Field_Element,
+	arg3: int,
+) {
+	mask := (u64(arg3) * 0xffffffffffffffff)
+	x1 := ((mask & arg2[0]) | ((~mask) & arg1[0]))
+	x2 := ((mask & arg2[1]) | ((~mask) & arg1[1]))
+	x3 := ((mask & arg2[2]) | ((~mask) & arg1[2]))
+	x4 := ((mask & arg2[3]) | ((~mask) & arg1[3]))
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+}
+// fe_cond_negate sets out1 to the negation of arg1 (via fe_opp) when ctrl is 1 and to arg1 when ctrl is 0.
+fe_cond_negate :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element, ctrl: int) {
+	tmp1: Montgomery_Domain_Field_Element = ---
+	fe_opp(&tmp1, arg1)
+	fe_cond_select(out1, arg1, &tmp1, ctrl)
+
+	fe_clear(&tmp1)
+}
diff --git a/core/crypto/_fiat/field_scalarp256r1/field64.odin b/core/crypto/_fiat/field_scalarp256r1/field64.odin
new file mode 100644
index 00000000000..607cb090280
--- /dev/null
+++ b/core/crypto/_fiat/field_scalarp256r1/field64.odin
@@ -0,0 +1,569 @@
+// The BSD 1-Clause License (BSD-1-Clause)
+//
+// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file)
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design,
+// Inc.
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +package field_scalarp256r1 + +// This file provides arithmetic on the field Z/(2^256 - 2^224 + 2^192 - 89188191075325690597107910205041859247) +// using a 64-bit Montgomery form internal representation. It is derived +// primarily from the machine generated Golang output from the fiat-crypto +// project. +// +// While the base implementation is provably correct, this implementation +// makes no such claims as the port and optimizations were done by hand. +// +// WARNING: While big-endian is the common representation used for this +// curve, the fiat output uses least-significant-limb first. + +import fiat "core:crypto/_fiat" +import "core:math/bits" + +// ELL is the saturated representation of the field order, least-significant +// limb first. 
+ELL :: [4]u64{0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000}
+// Field elements are four 64-bit limbs; the two distinct types keep Montgomery-form values and plain values from being mixed up.
+Montgomery_Domain_Field_Element :: distinct [4]u64
+Non_Montgomery_Domain_Field_Element :: distinct [4]u64
+// fe_mul sets out1 to the Montgomery product of arg1 and arg2: four rounds, each multiplying arg2 by one limb of arg1 and then reducing with 0xccd1c8aaee00bc4f and the limbs of ELL, followed by one conditional subtraction of ELL selected through fiat.cmovznz_u64.
+fe_mul :: proc "contextless" (out1, arg1, arg2: ^Montgomery_Domain_Field_Element) {
+	x1 := arg1[1]
+	x2 := arg1[2]
+	x3 := arg1[3]
+	x4 := arg1[0]
+	x6, x5 := bits.mul_u64(x4, arg2[3])
+	x8, x7 := bits.mul_u64(x4, arg2[2])
+	x10, x9 := bits.mul_u64(x4, arg2[1])
+	x12, x11 := bits.mul_u64(x4, arg2[0])
+	x13, x14 := bits.add_u64(x12, x9, u64(0x0))
+	x15, x16 := bits.add_u64(x10, x7, u64(fiat.u1(x14)))
+	x17, x18 := bits.add_u64(x8, x5, u64(fiat.u1(x16)))
+	x19 := (u64(fiat.u1(x18)) + x6)
+	_, x20 := bits.mul_u64(x11, 0xccd1c8aaee00bc4f)
+	x23, x22 := bits.mul_u64(x20, 0xffffffff00000000)
+	x25, x24 := bits.mul_u64(x20, 0xffffffffffffffff)
+	x27, x26 := bits.mul_u64(x20, 0xbce6faada7179e84)
+	x29, x28 := bits.mul_u64(x20, 0xf3b9cac2fc632551)
+	x30, x31 := bits.add_u64(x29, x26, u64(0x0))
+	x32, x33 := bits.add_u64(x27, x24, u64(fiat.u1(x31)))
+	x34, x35 := bits.add_u64(x25, x22, u64(fiat.u1(x33)))
+	x36 := (u64(fiat.u1(x35)) + x23)
+	_, x38 := bits.add_u64(x11, x28, u64(0x0))
+	x39, x40 := bits.add_u64(x13, x30, u64(fiat.u1(x38)))
+	x41, x42 := bits.add_u64(x15, x32, u64(fiat.u1(x40)))
+	x43, x44 := bits.add_u64(x17, x34, u64(fiat.u1(x42)))
+	x45, x46 := bits.add_u64(x19, x36, u64(fiat.u1(x44)))
+	x48, x47 := bits.mul_u64(x1, arg2[3])
+	x50, x49 := bits.mul_u64(x1, arg2[2])
+	x52, x51 := bits.mul_u64(x1, arg2[1])
+	x54, x53 := bits.mul_u64(x1, arg2[0])
+	x55, x56 := bits.add_u64(x54, x51, u64(0x0))
+	x57, x58 := bits.add_u64(x52, x49, u64(fiat.u1(x56)))
+	x59, x60 := bits.add_u64(x50, x47, u64(fiat.u1(x58)))
+	x61 := (u64(fiat.u1(x60)) + x48)
+	x62, x63 := bits.add_u64(x39, x53, u64(0x0))
+	x64, x65 := bits.add_u64(x41, x55, u64(fiat.u1(x63)))
+	x66, x67 := bits.add_u64(x43, x57, u64(fiat.u1(x65)))
+	x68, x69 := bits.add_u64(x45, x59, u64(fiat.u1(x67)))
+	x70, x71 := bits.add_u64(u64(fiat.u1(x46)), x61, u64(fiat.u1(x69)))
+	_, x72 := bits.mul_u64(x62, 0xccd1c8aaee00bc4f)
+	x75, x74 := bits.mul_u64(x72, 0xffffffff00000000)
+	x77, x76 := bits.mul_u64(x72, 0xffffffffffffffff)
+	x79, x78 := bits.mul_u64(x72, 0xbce6faada7179e84)
+	x81, x80 := bits.mul_u64(x72, 0xf3b9cac2fc632551)
+	x82, x83 := bits.add_u64(x81, x78, u64(0x0))
+	x84, x85 := bits.add_u64(x79, x76, u64(fiat.u1(x83)))
+	x86, x87 := bits.add_u64(x77, x74, u64(fiat.u1(x85)))
+	x88 := (u64(fiat.u1(x87)) + x75)
+	_, x90 := bits.add_u64(x62, x80, u64(0x0))
+	x91, x92 := bits.add_u64(x64, x82, u64(fiat.u1(x90)))
+	x93, x94 := bits.add_u64(x66, x84, u64(fiat.u1(x92)))
+	x95, x96 := bits.add_u64(x68, x86, u64(fiat.u1(x94)))
+	x97, x98 := bits.add_u64(x70, x88, u64(fiat.u1(x96)))
+	x99 := (u64(fiat.u1(x98)) + u64(fiat.u1(x71)))
+	x101, x100 := bits.mul_u64(x2, arg2[3])
+	x103, x102 := bits.mul_u64(x2, arg2[2])
+	x105, x104 := bits.mul_u64(x2, arg2[1])
+	x107, x106 := bits.mul_u64(x2, arg2[0])
+	x108, x109 := bits.add_u64(x107, x104, u64(0x0))
+	x110, x111 := bits.add_u64(x105, x102, u64(fiat.u1(x109)))
+	x112, x113 := bits.add_u64(x103, x100, u64(fiat.u1(x111)))
+	x114 := (u64(fiat.u1(x113)) + x101)
+	x115, x116 := bits.add_u64(x91, x106, u64(0x0))
+	x117, x118 := bits.add_u64(x93, x108, u64(fiat.u1(x116)))
+	x119, x120 := bits.add_u64(x95, x110, u64(fiat.u1(x118)))
+	x121, x122 := bits.add_u64(x97, x112, u64(fiat.u1(x120)))
+	x123, x124 := bits.add_u64(x99, x114, u64(fiat.u1(x122)))
+	_, x125 := bits.mul_u64(x115, 0xccd1c8aaee00bc4f)
+	x128, x127 := bits.mul_u64(x125, 0xffffffff00000000)
+	x130, x129 := bits.mul_u64(x125, 0xffffffffffffffff)
+	x132, x131 := bits.mul_u64(x125, 0xbce6faada7179e84)
+	x134, x133 := bits.mul_u64(x125, 0xf3b9cac2fc632551)
+	x135, x136 := bits.add_u64(x134, x131, u64(0x0))
+	x137, x138 := bits.add_u64(x132, x129, u64(fiat.u1(x136)))
+	x139, x140 := bits.add_u64(x130, x127, u64(fiat.u1(x138)))
+	x141 := (u64(fiat.u1(x140)) + x128)
+	_, x143 := bits.add_u64(x115, x133, u64(0x0))
+	x144, x145 := bits.add_u64(x117, x135, u64(fiat.u1(x143)))
+	x146, x147 := bits.add_u64(x119, x137, u64(fiat.u1(x145)))
+	x148, x149 := bits.add_u64(x121, x139, u64(fiat.u1(x147)))
+	x150, x151 := bits.add_u64(x123, x141, u64(fiat.u1(x149)))
+	x152 := (u64(fiat.u1(x151)) + u64(fiat.u1(x124)))
+	x154, x153 := bits.mul_u64(x3, arg2[3])
+	x156, x155 := bits.mul_u64(x3, arg2[2])
+	x158, x157 := bits.mul_u64(x3, arg2[1])
+	x160, x159 := bits.mul_u64(x3, arg2[0])
+	x161, x162 := bits.add_u64(x160, x157, u64(0x0))
+	x163, x164 := bits.add_u64(x158, x155, u64(fiat.u1(x162)))
+	x165, x166 := bits.add_u64(x156, x153, u64(fiat.u1(x164)))
+	x167 := (u64(fiat.u1(x166)) + x154)
+	x168, x169 := bits.add_u64(x144, x159, u64(0x0))
+	x170, x171 := bits.add_u64(x146, x161, u64(fiat.u1(x169)))
+	x172, x173 := bits.add_u64(x148, x163, u64(fiat.u1(x171)))
+	x174, x175 := bits.add_u64(x150, x165, u64(fiat.u1(x173)))
+	x176, x177 := bits.add_u64(x152, x167, u64(fiat.u1(x175)))
+	_, x178 := bits.mul_u64(x168, 0xccd1c8aaee00bc4f)
+	x181, x180 := bits.mul_u64(x178, 0xffffffff00000000)
+	x183, x182 := bits.mul_u64(x178, 0xffffffffffffffff)
+	x185, x184 := bits.mul_u64(x178, 0xbce6faada7179e84)
+	x187, x186 := bits.mul_u64(x178, 0xf3b9cac2fc632551)
+	x188, x189 := bits.add_u64(x187, x184, u64(0x0))
+	x190, x191 := bits.add_u64(x185, x182, u64(fiat.u1(x189)))
+	x192, x193 := bits.add_u64(x183, x180, u64(fiat.u1(x191)))
+	x194 := (u64(fiat.u1(x193)) + x181)
+	_, x196 := bits.add_u64(x168, x186, u64(0x0))
+	x197, x198 := bits.add_u64(x170, x188, u64(fiat.u1(x196)))
+	x199, x200 := bits.add_u64(x172, x190, u64(fiat.u1(x198)))
+	x201, x202 := bits.add_u64(x174, x192, u64(fiat.u1(x200)))
+	x203, x204 := bits.add_u64(x176, x194, u64(fiat.u1(x202)))
+	x205 := (u64(fiat.u1(x204)) + u64(fiat.u1(x177)))
+	x206, x207 := bits.sub_u64(x197, 0xf3b9cac2fc632551, u64(0x0))
+	x208, x209 := bits.sub_u64(x199, 0xbce6faada7179e84, u64(fiat.u1(x207)))
+	x210, x211 := bits.sub_u64(x201, 0xffffffffffffffff, u64(fiat.u1(x209)))
+	x212, x213 := bits.sub_u64(x203, 0xffffffff00000000, u64(fiat.u1(x211)))
+	_, x215 := bits.sub_u64(x205, u64(0x0), u64(fiat.u1(x213)))
+	x216 := fiat.cmovznz_u64(fiat.u1(x215), x206, x197)
+	x217 := fiat.cmovznz_u64(fiat.u1(x215), x208, x199)
+	x218 := fiat.cmovznz_u64(fiat.u1(x215), x210, x201)
+	x219 := fiat.cmovznz_u64(fiat.u1(x215), x212, x203)
+	out1[0] = x216
+	out1[1] = x217
+	out1[2] = x218
+	out1[3] = x219
+}
+// fe_square sets out1 to the Montgomery square of arg1; the statement sequence is that of fe_mul with arg1 used for both operands.
+fe_square :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) {
+	x1 := arg1[1]
+	x2 := arg1[2]
+	x3 := arg1[3]
+	x4 := arg1[0]
+	x6, x5 := bits.mul_u64(x4, arg1[3])
+	x8, x7 := bits.mul_u64(x4, arg1[2])
+	x10, x9 := bits.mul_u64(x4, arg1[1])
+	x12, x11 := bits.mul_u64(x4, arg1[0])
+	x13, x14 := bits.add_u64(x12, x9, u64(0x0))
+	x15, x16 := bits.add_u64(x10, x7, u64(fiat.u1(x14)))
+	x17, x18 := bits.add_u64(x8, x5, u64(fiat.u1(x16)))
+	x19 := (u64(fiat.u1(x18)) + x6)
+	_, x20 := bits.mul_u64(x11, 0xccd1c8aaee00bc4f)
+	x23, x22 := bits.mul_u64(x20, 0xffffffff00000000)
+	x25, x24 := bits.mul_u64(x20, 0xffffffffffffffff)
+	x27, x26 := bits.mul_u64(x20, 0xbce6faada7179e84)
+	x29, x28 := bits.mul_u64(x20, 0xf3b9cac2fc632551)
+	x30, x31 := bits.add_u64(x29, x26, u64(0x0))
+	x32, x33 := bits.add_u64(x27, x24, u64(fiat.u1(x31)))
+	x34, x35 := bits.add_u64(x25, x22, u64(fiat.u1(x33)))
+	x36 := (u64(fiat.u1(x35)) + x23)
+	_, x38 := bits.add_u64(x11, x28, u64(0x0))
+	x39, x40 := bits.add_u64(x13, x30, u64(fiat.u1(x38)))
+	x41, x42 := bits.add_u64(x15, x32, u64(fiat.u1(x40)))
+	x43, x44 := bits.add_u64(x17, x34, u64(fiat.u1(x42)))
+	x45, x46 := bits.add_u64(x19, x36, u64(fiat.u1(x44)))
+	x48, x47 := bits.mul_u64(x1, arg1[3])
+	x50, x49 := bits.mul_u64(x1, arg1[2])
+	x52, x51 := bits.mul_u64(x1, arg1[1])
+	x54, x53 := bits.mul_u64(x1, arg1[0])
+	x55, x56 := bits.add_u64(x54, x51, u64(0x0))
+	x57, x58 := bits.add_u64(x52, x49, u64(fiat.u1(x56)))
+	x59, x60 := bits.add_u64(x50, x47, u64(fiat.u1(x58)))
+	x61 := (u64(fiat.u1(x60)) + x48)
+	x62, x63 := bits.add_u64(x39, x53, u64(0x0))
+	x64, x65 := bits.add_u64(x41, x55, u64(fiat.u1(x63)))
+	x66, x67 := bits.add_u64(x43, x57, u64(fiat.u1(x65)))
+	x68, x69 := bits.add_u64(x45, x59, u64(fiat.u1(x67)))
+	x70, x71 := bits.add_u64(u64(fiat.u1(x46)), x61, u64(fiat.u1(x69)))
+	_, x72 := bits.mul_u64(x62, 0xccd1c8aaee00bc4f)
+	x75, x74 := bits.mul_u64(x72, 0xffffffff00000000)
+	x77, x76 := bits.mul_u64(x72, 0xffffffffffffffff)
+	x79, x78 := bits.mul_u64(x72, 0xbce6faada7179e84)
+	x81, x80 := bits.mul_u64(x72, 0xf3b9cac2fc632551)
+	x82, x83 := bits.add_u64(x81, x78, u64(0x0))
+	x84, x85 := bits.add_u64(x79, x76, u64(fiat.u1(x83)))
+	x86, x87 := bits.add_u64(x77, x74, u64(fiat.u1(x85)))
+	x88 := (u64(fiat.u1(x87)) + x75)
+	_, x90 := bits.add_u64(x62, x80, u64(0x0))
+	x91, x92 := bits.add_u64(x64, x82, u64(fiat.u1(x90)))
+	x93, x94 := bits.add_u64(x66, x84, u64(fiat.u1(x92)))
+	x95, x96 := bits.add_u64(x68, x86, u64(fiat.u1(x94)))
+	x97, x98 := bits.add_u64(x70, x88, u64(fiat.u1(x96)))
+	x99 := (u64(fiat.u1(x98)) + u64(fiat.u1(x71)))
+	x101, x100 := bits.mul_u64(x2, arg1[3])
+	x103, x102 := bits.mul_u64(x2, arg1[2])
+	x105, x104 := bits.mul_u64(x2, arg1[1])
+	x107, x106 := bits.mul_u64(x2, arg1[0])
+	x108, x109 := bits.add_u64(x107, x104, u64(0x0))
+	x110, x111 := bits.add_u64(x105, x102, u64(fiat.u1(x109)))
+	x112, x113 := bits.add_u64(x103, x100, u64(fiat.u1(x111)))
+	x114 := (u64(fiat.u1(x113)) + x101)
+	x115, x116 := bits.add_u64(x91, x106, u64(0x0))
+	x117, x118 := bits.add_u64(x93, x108, u64(fiat.u1(x116)))
+	x119, x120 := bits.add_u64(x95, x110, u64(fiat.u1(x118)))
+	x121, x122 := bits.add_u64(x97, x112, u64(fiat.u1(x120)))
+	x123, x124 := bits.add_u64(x99, x114, u64(fiat.u1(x122)))
+	_, x125 := bits.mul_u64(x115, 0xccd1c8aaee00bc4f)
+	x128, x127 := bits.mul_u64(x125, 0xffffffff00000000)
+	x130, x129 := bits.mul_u64(x125, 0xffffffffffffffff)
+	x132, x131 := bits.mul_u64(x125, 0xbce6faada7179e84)
+	x134, x133 := bits.mul_u64(x125, 0xf3b9cac2fc632551)
+	x135, x136 := bits.add_u64(x134, x131, u64(0x0))
+	x137, x138 := bits.add_u64(x132, x129, u64(fiat.u1(x136)))
+	x139, x140 := bits.add_u64(x130, x127, u64(fiat.u1(x138)))
+	x141 := (u64(fiat.u1(x140)) + x128)
+	_, x143 := bits.add_u64(x115, x133, u64(0x0))
+	x144, x145 := bits.add_u64(x117, x135, u64(fiat.u1(x143)))
+	x146, x147 := bits.add_u64(x119, x137, u64(fiat.u1(x145)))
+	x148, x149 := bits.add_u64(x121, x139, u64(fiat.u1(x147)))
+	x150, x151 := bits.add_u64(x123, x141, u64(fiat.u1(x149)))
+	x152 := (u64(fiat.u1(x151)) + u64(fiat.u1(x124)))
+	x154, x153 := bits.mul_u64(x3, arg1[3])
+	x156, x155 := bits.mul_u64(x3, arg1[2])
+	x158, x157 := bits.mul_u64(x3, arg1[1])
+	x160, x159 := bits.mul_u64(x3, arg1[0])
+	x161, x162 := bits.add_u64(x160, x157, u64(0x0))
+	x163, x164 := bits.add_u64(x158, x155, u64(fiat.u1(x162)))
+	x165, x166 := bits.add_u64(x156, x153, u64(fiat.u1(x164)))
+	x167 := (u64(fiat.u1(x166)) + x154)
+	x168, x169 := bits.add_u64(x144, x159, u64(0x0))
+	x170, x171 := bits.add_u64(x146, x161, u64(fiat.u1(x169)))
+	x172, x173 := bits.add_u64(x148, x163, u64(fiat.u1(x171)))
+	x174, x175 := bits.add_u64(x150, x165, u64(fiat.u1(x173)))
+	x176, x177 := bits.add_u64(x152, x167, u64(fiat.u1(x175)))
+	_, x178 := bits.mul_u64(x168, 0xccd1c8aaee00bc4f)
+	x181, x180 := bits.mul_u64(x178, 0xffffffff00000000)
+	x183, x182 := bits.mul_u64(x178, 0xffffffffffffffff)
+	x185, x184 := bits.mul_u64(x178, 0xbce6faada7179e84)
+	x187, x186 := bits.mul_u64(x178, 0xf3b9cac2fc632551)
+	x188, x189 := bits.add_u64(x187, x184, u64(0x0))
+	x190, x191 := bits.add_u64(x185, x182, u64(fiat.u1(x189)))
+	x192, x193 := bits.add_u64(x183, x180, u64(fiat.u1(x191)))
+	x194 := (u64(fiat.u1(x193)) + x181)
+	_, x196 := bits.add_u64(x168, x186, u64(0x0))
+	x197, x198 := bits.add_u64(x170, x188, u64(fiat.u1(x196)))
+	x199, x200 := bits.add_u64(x172, x190, u64(fiat.u1(x198)))
+	x201, x202 := bits.add_u64(x174, x192, u64(fiat.u1(x200)))
+	x203, x204 := bits.add_u64(x176, x194, u64(fiat.u1(x202)))
+	x205 := (u64(fiat.u1(x204)) + u64(fiat.u1(x177)))
+	x206, x207 := bits.sub_u64(x197, 0xf3b9cac2fc632551, u64(0x0))
+	x208, x209 := bits.sub_u64(x199, 0xbce6faada7179e84, u64(fiat.u1(x207)))
+	x210, x211 := bits.sub_u64(x201, 0xffffffffffffffff, u64(fiat.u1(x209)))
+	x212, x213 := bits.sub_u64(x203, 0xffffffff00000000, u64(fiat.u1(x211)))
+	_, x215 := bits.sub_u64(x205, u64(0x0), u64(fiat.u1(x213)))
+	x216 := fiat.cmovznz_u64(fiat.u1(x215), x206, x197)
+	x217 := fiat.cmovznz_u64(fiat.u1(x215), x208, x199)
+	x218 := fiat.cmovznz_u64(fiat.u1(x215), x210, x201)
+	x219 := fiat.cmovznz_u64(fiat.u1(x215), x212, x203)
+	out1[0] = x216
+	out1[1] = x217
+	out1[2] = x218
+	out1[3] = x219
+}
+// fe_add sets out1 to arg1 + arg2: a 4-limb add with carry, a trial subtraction of ELL, and a cmovznz selection between the two results based on the final borrow.
+fe_add :: proc "contextless" (out1, arg1, arg2: ^Montgomery_Domain_Field_Element) {
+	x1, x2 := bits.add_u64(arg1[0], arg2[0], u64(0x0))
+	x3, x4 := bits.add_u64(arg1[1], arg2[1], u64(fiat.u1(x2)))
+	x5, x6 := bits.add_u64(arg1[2], arg2[2], u64(fiat.u1(x4)))
+	x7, x8 := bits.add_u64(arg1[3], arg2[3], u64(fiat.u1(x6)))
+	x9, x10 := bits.sub_u64(x1, 0xf3b9cac2fc632551, u64(0x0))
+	x11, x12 := bits.sub_u64(x3, 0xbce6faada7179e84, u64(fiat.u1(x10)))
+	x13, x14 := bits.sub_u64(x5, 0xffffffffffffffff, u64(fiat.u1(x12)))
+	x15, x16 := bits.sub_u64(x7, 0xffffffff00000000, u64(fiat.u1(x14)))
+	_, x18 := bits.sub_u64(u64(fiat.u1(x8)), u64(0x0), u64(fiat.u1(x16)))
+	x19 := fiat.cmovznz_u64(fiat.u1(x18), x9, x1)
+	x20 := fiat.cmovznz_u64(fiat.u1(x18), x11, x3)
+	x21 := fiat.cmovznz_u64(fiat.u1(x18), x13, x5)
+	x22 := fiat.cmovznz_u64(fiat.u1(x18), x15, x7)
+	out1[0] = x19
+	out1[1] = x20
+	out1[2] = x21
+	out1[3] = x22
+}
+// fe_sub sets out1 to arg1 - arg2: a 4-limb subtract with borrow, after which ELL, masked by x9 (derived from the final borrow), is added back.
+fe_sub :: proc "contextless" (out1, arg1, arg2: ^Montgomery_Domain_Field_Element) {
+	x1, x2 := bits.sub_u64(arg1[0], arg2[0], u64(0x0))
+	x3, x4 := bits.sub_u64(arg1[1], arg2[1], u64(fiat.u1(x2)))
+	x5, x6 := bits.sub_u64(arg1[2], arg2[2], u64(fiat.u1(x4)))
+	x7, x8 := bits.sub_u64(arg1[3], arg2[3], u64(fiat.u1(x6)))
+	x9 := fiat.cmovznz_u64(fiat.u1(x8), u64(0x0), 0xffffffffffffffff)
+	x10, x11 := bits.add_u64(x1, (x9 & 0xf3b9cac2fc632551), u64(0x0))
+	x12, x13 := bits.add_u64(x3, (x9 & 0xbce6faada7179e84), u64(fiat.u1(x11)))
+	x14, x15 := bits.add_u64(x5, x9, u64(fiat.u1(x13)))
+	x16, _ := bits.add_u64(x7, (x9 & 0xffffffff00000000), u64(fiat.u1(x15)))
+	out1[0] = x10
+	out1[1] = x12
+	out1[2] = x14
+	out1[3] = x16
+}
+// fe_opp sets out1 to the negation of arg1, computed as 0 - arg1 with the same masked add-back of ELL that fe_sub uses.
+fe_opp :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) {
+	x1, x2 := bits.sub_u64(u64(0x0), arg1[0], u64(0x0))
+	x3, x4 := bits.sub_u64(u64(0x0), arg1[1], u64(fiat.u1(x2)))
+	x5, x6 := bits.sub_u64(u64(0x0), arg1[2], u64(fiat.u1(x4)))
+	x7, x8 := bits.sub_u64(u64(0x0), arg1[3], u64(fiat.u1(x6)))
+	x9 := fiat.cmovznz_u64(fiat.u1(x8), u64(0x0), 0xffffffffffffffff)
+	x10, x11 := bits.add_u64(x1, (x9 & 0xf3b9cac2fc632551), u64(0x0))
+	x12, x13 := bits.add_u64(x3, (x9 & 0xbce6faada7179e84), u64(fiat.u1(x11)))
+	x14, x15 := bits.add_u64(x5, x9, u64(fiat.u1(x13)))
+	x16, _ := bits.add_u64(x7, (x9 & 0xffffffff00000000), u64(fiat.u1(x15)))
+	out1[0] = x10
+	out1[1] = x12
+	out1[2] = x14
+	out1[3] = x16
+}
+// fe_one sets out1 to the limbs of 2^256 - ELL, i.e. the value 1 in Montgomery form (R = 2^256).
+fe_one :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element) {
+	out1[0] = 0xc46353d039cdaaf
+	out1[1] = 0x4319055258e8617b
+	out1[2] = u64(0x0)
+	out1[3] = 0xffffffff
+}
+// fe_non_zero returns the bitwise OR of the four limbs of arg1, which is 0 iff every limb is 0.
+fe_non_zero :: proc "contextless" (arg1: ^Montgomery_Domain_Field_Element) -> u64 {
+	return arg1[0] | (arg1[1] | (arg1[2] | arg1[3]))
+}
+// fe_cond_assign replaces each limb of out1 with fiat.cmovznz_u64(arg2, out1[i], arg1[i]); presumably out1 becomes arg1 when arg2 is 1 and is left unchanged when arg2 is 0 (confirm against fiat.cmovznz_u64). Optimization and inlining are disabled for this proc.
+@(optimization_mode = "none")
+fe_cond_assign :: #force_no_inline proc "contextless" (
+	out1, arg1: ^Montgomery_Domain_Field_Element,
+	arg2: int,
+) {
+	x1 := fiat.cmovznz_u64(fiat.u1(arg2), out1[0], arg1[0])
+	x2 := fiat.cmovznz_u64(fiat.u1(arg2), out1[1], arg1[1])
+	x3 := fiat.cmovznz_u64(fiat.u1(arg2), out1[2], arg1[2])
+	x4 := fiat.cmovznz_u64(fiat.u1(arg2), out1[3], arg1[3])
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+}
+// fe_from_montgomery converts arg1 out of Montgomery form into out1: four reduction rounds, each folding in the next limb of arg1, followed by a conditional subtraction of ELL.
+fe_from_montgomery :: proc "contextless" (
+	out1: ^Non_Montgomery_Domain_Field_Element,
+	arg1: ^Montgomery_Domain_Field_Element,
+) {
+	x1 := arg1[0]
+	_, x2 := bits.mul_u64(x1, 0xccd1c8aaee00bc4f)
+	x5, x4 := bits.mul_u64(x2, 0xffffffff00000000)
+	x7, x6 := bits.mul_u64(x2, 0xffffffffffffffff)
+	x9, x8 := bits.mul_u64(x2, 0xbce6faada7179e84)
+	x11, x10 := bits.mul_u64(x2, 0xf3b9cac2fc632551)
+	x12, x13 := bits.add_u64(x11, x8, u64(0x0))
+	x14, x15 := bits.add_u64(x9, x6, u64(fiat.u1(x13)))
+	x16, x17 := bits.add_u64(x7, x4, u64(fiat.u1(x15)))
+	_, x19 := bits.add_u64(x1, x10, u64(0x0))
+	x20, x21 := bits.add_u64(u64(0x0), x12, u64(fiat.u1(x19)))
+	x22, x23 := bits.add_u64(u64(0x0), x14, u64(fiat.u1(x21)))
+	x24, x25 := bits.add_u64(u64(0x0), x16, u64(fiat.u1(x23)))
+	x26, x27 := bits.add_u64(x20, arg1[1], u64(0x0))
+	x28, x29 := bits.add_u64(x22, u64(0x0), u64(fiat.u1(x27)))
+	x30, x31 := bits.add_u64(x24, u64(0x0), u64(fiat.u1(x29)))
+	_, x32 := bits.mul_u64(x26, 0xccd1c8aaee00bc4f)
+	x35, x34 := bits.mul_u64(x32, 0xffffffff00000000)
+	x37, x36 := bits.mul_u64(x32, 0xffffffffffffffff)
+	x39, x38 := bits.mul_u64(x32, 0xbce6faada7179e84)
+	x41, x40 := bits.mul_u64(x32, 0xf3b9cac2fc632551)
+	x42, x43 := bits.add_u64(x41, x38, u64(0x0))
+	x44, x45 := bits.add_u64(x39, x36, u64(fiat.u1(x43)))
+	x46, x47 := bits.add_u64(x37, x34, u64(fiat.u1(x45)))
+	_, x49 := bits.add_u64(x26, x40, u64(0x0))
+	x50, x51 := bits.add_u64(x28, x42, u64(fiat.u1(x49)))
+	x52, x53 := bits.add_u64(x30, x44, u64(fiat.u1(x51)))
+	x54, x55 := bits.add_u64((u64(fiat.u1(x31)) + (u64(fiat.u1(x25)) + (u64(fiat.u1(x17)) + x5))), x46, u64(fiat.u1(x53)))
+	x56, x57 := bits.add_u64(x50, arg1[2], u64(0x0))
+	x58, x59 := bits.add_u64(x52, u64(0x0), u64(fiat.u1(x57)))
+	x60, x61 := bits.add_u64(x54, u64(0x0), u64(fiat.u1(x59)))
+	_, x62 := bits.mul_u64(x56, 0xccd1c8aaee00bc4f)
+	x65, x64 := bits.mul_u64(x62, 0xffffffff00000000)
+	x67, x66 := bits.mul_u64(x62, 0xffffffffffffffff)
+	x69, x68 := bits.mul_u64(x62, 0xbce6faada7179e84)
+	x71, x70 := bits.mul_u64(x62, 0xf3b9cac2fc632551)
+	x72, x73 := bits.add_u64(x71, x68, u64(0x0))
+	x74, x75 := bits.add_u64(x69, x66, u64(fiat.u1(x73)))
+	x76, x77 := bits.add_u64(x67, x64, u64(fiat.u1(x75)))
+	_, x79 := bits.add_u64(x56, x70, u64(0x0))
+	x80, x81 := bits.add_u64(x58, x72, u64(fiat.u1(x79)))
+	x82, x83 := bits.add_u64(x60, x74, u64(fiat.u1(x81)))
+	x84, x85 := bits.add_u64((u64(fiat.u1(x61)) + (u64(fiat.u1(x55)) + (u64(fiat.u1(x47)) + x35))), x76, u64(fiat.u1(x83)))
+	x86, x87 := bits.add_u64(x80, arg1[3], u64(0x0))
+	x88, x89 := bits.add_u64(x82, u64(0x0), u64(fiat.u1(x87)))
+	x90, x91 := bits.add_u64(x84, u64(0x0), u64(fiat.u1(x89)))
+	_, x92 := bits.mul_u64(x86, 0xccd1c8aaee00bc4f)
+	x95, x94 := bits.mul_u64(x92, 0xffffffff00000000)
+	x97, x96 := bits.mul_u64(x92, 0xffffffffffffffff)
+	x99, x98 := bits.mul_u64(x92, 0xbce6faada7179e84)
+	x101, x100 := bits.mul_u64(x92, 0xf3b9cac2fc632551)
+	x102, x103 := bits.add_u64(x101, x98, u64(0x0))
+	x104, x105 := bits.add_u64(x99, x96, u64(fiat.u1(x103)))
+	x106, x107 := bits.add_u64(x97, x94, u64(fiat.u1(x105)))
+	_, x109 := bits.add_u64(x86, x100, u64(0x0))
+	x110, x111 := bits.add_u64(x88, x102, u64(fiat.u1(x109)))
+	x112, x113 := bits.add_u64(x90, x104, u64(fiat.u1(x111)))
+	x114, x115 := bits.add_u64((u64(fiat.u1(x91)) + (u64(fiat.u1(x85)) + (u64(fiat.u1(x77)) + x65))), x106, u64(fiat.u1(x113)))
+	x116 := (u64(fiat.u1(x115)) + (u64(fiat.u1(x107)) + x95))
+	x117, x118 := bits.sub_u64(x110, 0xf3b9cac2fc632551, u64(0x0))
+	x119, x120 := bits.sub_u64(x112, 0xbce6faada7179e84, u64(fiat.u1(x118)))
+	x121, x122 := bits.sub_u64(x114, 0xffffffffffffffff, u64(fiat.u1(x120)))
+	x123, x124 := bits.sub_u64(x116, 0xffffffff00000000, u64(fiat.u1(x122)))
+	_, x126 := bits.sub_u64(u64(0x0), u64(0x0), u64(fiat.u1(x124)))
+	x127 := fiat.cmovznz_u64(fiat.u1(x126), x117, x110)
+	x128 := fiat.cmovznz_u64(fiat.u1(x126), x119, x112)
+	x129 := fiat.cmovznz_u64(fiat.u1(x126), x121, x114)
+	x130 := fiat.cmovznz_u64(fiat.u1(x126), x123, x116)
+	out1[0] = x127
+	out1[1] = x128
+	out1[2] = x129
+	out1[3] = x130
+}
+
+fe_to_montgomery :: proc "contextless" (
+	out1: ^Montgomery_Domain_Field_Element,
+	arg1: ^Non_Montgomery_Domain_Field_Element,
+) {
+	x1 := arg1[1]
+	x2 := arg1[2]
+	x3 := arg1[3]
+	x4 := arg1[0]
+	x6, x5 := bits.mul_u64(x4, 0x66e12d94f3d95620)
+	x8, x7 := bits.mul_u64(x4, 0x2845b2392b6bec59)
+	x10, x9 := bits.mul_u64(x4, 0x4699799c49bd6fa6)
+	x12, x11 := bits.mul_u64(x4, 0x83244c95be79eea2)
+	x13, x14 := bits.add_u64(x12, x9, u64(0x0))
+	x15, x16 := bits.add_u64(x10, x7, u64(fiat.u1(x14)))
+	x17, x18 := bits.add_u64(x8, x5, u64(fiat.u1(x16)))
+	_, x19 := bits.mul_u64(x11, 0xccd1c8aaee00bc4f)
+	x22, x21 := bits.mul_u64(x19, 0xffffffff00000000)
+	x24, x23 := bits.mul_u64(x19, 0xffffffffffffffff)
+	x26, x25 := bits.mul_u64(x19, 0xbce6faada7179e84)
+	x28, x27 := bits.mul_u64(x19, 0xf3b9cac2fc632551)
+	x29, x30 := bits.add_u64(x28, x25, u64(0x0))
+	x31, x32 := bits.add_u64(x26, x23, u64(fiat.u1(x30)))
+	x33, x34 := bits.add_u64(x24, x21, u64(fiat.u1(x32)))
+	_, x36 := bits.add_u64(x11, x27, u64(0x0))
+	x37, x38 := bits.add_u64(x13, x29, u64(fiat.u1(x36)))
+	x39, x40 := bits.add_u64(x15, x31, u64(fiat.u1(x38)))
+	x41, x42 := bits.add_u64(x17, x33, u64(fiat.u1(x40)))
+	x43, x44 := bits.add_u64((u64(fiat.u1(x18)) + x6), (u64(fiat.u1(x34)) + x22), u64(fiat.u1(x42)))
+	x46, x45 := bits.mul_u64(x1, 0x66e12d94f3d95620)
+	x48, x47 := bits.mul_u64(x1, 0x2845b2392b6bec59)
+	x50, x49 := bits.mul_u64(x1, 0x4699799c49bd6fa6)
+	x52, x51 := bits.mul_u64(x1, 0x83244c95be79eea2)
+	x53, x54 := bits.add_u64(x52, x49, u64(0x0))
+	x55, x56 := bits.add_u64(x50, x47, u64(fiat.u1(x54)))
+	x57, x58 := bits.add_u64(x48, x45, u64(fiat.u1(x56)))
+	x59, x60 := bits.add_u64(x37, x51, u64(0x0))
+	x61, x62 := bits.add_u64(x39, x53, u64(fiat.u1(x60)))
+	x63, x64 := bits.add_u64(x41, x55, u64(fiat.u1(x62)))
+	x65, x66 := bits.add_u64(x43, x57, u64(fiat.u1(x64)))
+	_, x67 := bits.mul_u64(x59, 0xccd1c8aaee00bc4f)
+	x70, x69
:= bits.mul_u64(x67, 0xffffffff00000000) + x72, x71 := bits.mul_u64(x67, 0xffffffffffffffff) + x74, x73 := bits.mul_u64(x67, 0xbce6faada7179e84) + x76, x75 := bits.mul_u64(x67, 0xf3b9cac2fc632551) + x77, x78 := bits.add_u64(x76, x73, u64(0x0)) + x79, x80 := bits.add_u64(x74, x71, u64(fiat.u1(x78))) + x81, x82 := bits.add_u64(x72, x69, u64(fiat.u1(x80))) + _, x84 := bits.add_u64(x59, x75, u64(0x0)) + x85, x86 := bits.add_u64(x61, x77, u64(fiat.u1(x84))) + x87, x88 := bits.add_u64(x63, x79, u64(fiat.u1(x86))) + x89, x90 := bits.add_u64(x65, x81, u64(fiat.u1(x88))) + x91, x92 := bits.add_u64(((u64(fiat.u1(x66)) + u64(fiat.u1(x44))) + (u64(fiat.u1(x58)) + x46)), (u64(fiat.u1(x82)) + x70), u64(fiat.u1(x90))) + x94, x93 := bits.mul_u64(x2, 0x66e12d94f3d95620) + x96, x95 := bits.mul_u64(x2, 0x2845b2392b6bec59) + x98, x97 := bits.mul_u64(x2, 0x4699799c49bd6fa6) + x100, x99 := bits.mul_u64(x2, 0x83244c95be79eea2) + x101, x102 := bits.add_u64(x100, x97, u64(0x0)) + x103, x104 := bits.add_u64(x98, x95, u64(fiat.u1(x102))) + x105, x106 := bits.add_u64(x96, x93, u64(fiat.u1(x104))) + x107, x108 := bits.add_u64(x85, x99, u64(0x0)) + x109, x110 := bits.add_u64(x87, x101, u64(fiat.u1(x108))) + x111, x112 := bits.add_u64(x89, x103, u64(fiat.u1(x110))) + x113, x114 := bits.add_u64(x91, x105, u64(fiat.u1(x112))) + _, x115 := bits.mul_u64(x107, 0xccd1c8aaee00bc4f) + x118, x117 := bits.mul_u64(x115, 0xffffffff00000000) + x120, x119 := bits.mul_u64(x115, 0xffffffffffffffff) + x122, x121 := bits.mul_u64(x115, 0xbce6faada7179e84) + x124, x123 := bits.mul_u64(x115, 0xf3b9cac2fc632551) + x125, x126 := bits.add_u64(x124, x121, u64(0x0)) + x127, x128 := bits.add_u64(x122, x119, u64(fiat.u1(x126))) + x129, x130 := bits.add_u64(x120, x117, u64(fiat.u1(x128))) + _, x132 := bits.add_u64(x107, x123, u64(0x0)) + x133, x134 := bits.add_u64(x109, x125, u64(fiat.u1(x132))) + x135, x136 := bits.add_u64(x111, x127, u64(fiat.u1(x134))) + x137, x138 := bits.add_u64(x113, x129, u64(fiat.u1(x136))) + x139, 
x140 := bits.add_u64(((u64(fiat.u1(x114)) + u64(fiat.u1(x92))) + (u64(fiat.u1(x106)) + x94)), (u64(fiat.u1(x130)) + x118), u64(fiat.u1(x138))) + x142, x141 := bits.mul_u64(x3, 0x66e12d94f3d95620) + x144, x143 := bits.mul_u64(x3, 0x2845b2392b6bec59) + x146, x145 := bits.mul_u64(x3, 0x4699799c49bd6fa6) + x148, x147 := bits.mul_u64(x3, 0x83244c95be79eea2) + x149, x150 := bits.add_u64(x148, x145, u64(0x0)) + x151, x152 := bits.add_u64(x146, x143, u64(fiat.u1(x150))) + x153, x154 := bits.add_u64(x144, x141, u64(fiat.u1(x152))) + x155, x156 := bits.add_u64(x133, x147, u64(0x0)) + x157, x158 := bits.add_u64(x135, x149, u64(fiat.u1(x156))) + x159, x160 := bits.add_u64(x137, x151, u64(fiat.u1(x158))) + x161, x162 := bits.add_u64(x139, x153, u64(fiat.u1(x160))) + _, x163 := bits.mul_u64(x155, 0xccd1c8aaee00bc4f) + x166, x165 := bits.mul_u64(x163, 0xffffffff00000000) + x168, x167 := bits.mul_u64(x163, 0xffffffffffffffff) + x170, x169 := bits.mul_u64(x163, 0xbce6faada7179e84) + x172, x171 := bits.mul_u64(x163, 0xf3b9cac2fc632551) + x173, x174 := bits.add_u64(x172, x169, u64(0x0)) + x175, x176 := bits.add_u64(x170, x167, u64(fiat.u1(x174))) + x177, x178 := bits.add_u64(x168, x165, u64(fiat.u1(x176))) + _, x180 := bits.add_u64(x155, x171, u64(0x0)) + x181, x182 := bits.add_u64(x157, x173, u64(fiat.u1(x180))) + x183, x184 := bits.add_u64(x159, x175, u64(fiat.u1(x182))) + x185, x186 := bits.add_u64(x161, x177, u64(fiat.u1(x184))) + x187, x188 := bits.add_u64(((u64(fiat.u1(x162)) + u64(fiat.u1(x140))) + (u64(fiat.u1(x154)) + x142)), (u64(fiat.u1(x178)) + x166), u64(fiat.u1(x186))) + x189, x190 := bits.sub_u64(x181, 0xf3b9cac2fc632551, u64(0x0)) + x191, x192 := bits.sub_u64(x183, 0xbce6faada7179e84, u64(fiat.u1(x190))) + x193, x194 := bits.sub_u64(x185, 0xffffffffffffffff, u64(fiat.u1(x192))) + x195, x196 := bits.sub_u64(x187, 0xffffffff00000000, u64(fiat.u1(x194))) + _, x198 := bits.sub_u64(u64(fiat.u1(x188)), u64(0x0), u64(fiat.u1(x196))) + x199 := fiat.cmovznz_u64(fiat.u1(x198), 
x189, x181)
+	x200 := fiat.cmovznz_u64(fiat.u1(x198), x191, x183)
+	x201 := fiat.cmovznz_u64(fiat.u1(x198), x193, x185)
+	x202 := fiat.cmovznz_u64(fiat.u1(x198), x195, x187)
+	out1[0] = x199
+	out1[1] = x200
+	out1[2] = x201
+	out1[3] = x202
+}
diff --git a/core/crypto/_weierstrass/fe.odin b/core/crypto/_weierstrass/fe.odin
new file mode 100644
index 00000000000..61438dbcdfe
--- /dev/null
+++ b/core/crypto/_weierstrass/fe.odin
@@ -0,0 +1,135 @@
+package _weierstrass
+
+import "core:math/bits"
+import p256r1 "core:crypto/_fiat/field_p256r1"
+
+// Field_Element_p256r1 is the type used for P-256 point coordinates; it
+// aliases the fiat package's Montgomery-domain field element.
+Field_Element_p256r1 :: p256r1.Montgomery_Domain_Field_Element
+
+// FE_SIZE_P256R1 is the length in bytes of a serialized field element
+// (the point decoding routines reject coordinate buffers of any other
+// length).
+FE_SIZE_P256R1 :: 32
+
+// The single-member procedure groups below give the curve-generic point
+// code one `fe_*` name per field operation.  Each currently forwards to
+// the P-256 implementation only.
+
+fe_clear :: proc {
+	p256r1.fe_clear,
+}
+
+fe_clear_vec :: proc {
+	p256r1.fe_clear_vec,
+}
+
+fe_set_bytes :: proc {
+	p256r1.fe_from_bytes,
+}
+fe_bytes :: proc {
+	p256r1.fe_to_bytes,
+}
+
+fe_set :: proc {
+	p256r1.fe_set,
+}
+
+fe_zero :: proc {
+	p256r1.fe_zero,
+}
+
+// Curve constants (`a`, `b` and the generator coordinates) are exposed as
+// setter procedures defined at the bottom of this file.
+
+fe_a :: proc {
+	fe_a_p256r1,
+}
+
+fe_b :: proc {
+	fe_b_p256r1,
+}
+
+fe_gen_x :: proc {
+	fe_gen_x_p256r1,
+}
+
+fe_gen_y :: proc {
+	fe_gen_y_p256r1,
+}
+
+fe_one :: proc {
+	p256r1.fe_one,
+}
+
+fe_add :: proc {
+	p256r1.fe_add,
+}
+
+fe_sub :: proc {
+	p256r1.fe_sub,
+}
+
+// Note: negation maps to the fiat routine named `fe_opp`.
+fe_negate :: proc {
+	p256r1.fe_opp,
+}
+
+fe_mul :: proc {
+	p256r1.fe_mul,
+}
+
+fe_square :: proc {
+	p256r1.fe_square,
+}
+
+fe_inv :: proc {
+	p256r1.fe_inv,
+}
+
+fe_sqrt :: proc {
+	p256r1.fe_sqrt,
+}
+
+fe_equal :: proc {
+	p256r1.fe_equal,
+}
+
+fe_is_odd :: proc {
+	p256r1.fe_is_odd,
+}
+
+fe_is_zero :: proc {
+	fe_is_zero_p256r1,
+}
+
+fe_cond_select :: proc {
+	p256r1.fe_cond_select,
+}
+
+// fe_a_p256r1 overwrites `fe` with the curve parameter `a`.
+//
+// NOTE(review): the four limbs are presumably the Montgomery-domain
+// encoding of `a`, least-significant limb first.  Assuming R = 2^256,
+// -3*R mod p would be
+//   {0xfffffffffffffffc, 0x00000003ffffffff, 0x0, 0xfffffffc00000004},
+// which is not what is stored below -- TODO confirm how these values
+// were derived (e.g. re-derive them with fe_set_bytes on the hex value).
+fe_a_p256r1 :: proc "contextless" (fe: ^Field_Element_p256r1) {
+	// a = 0xffffffff00000001000000000000000000000000fffffffffffffffffffffffc
+	//   = -3 mod p
+	fe[0] = 18158513697507508224
+	fe[1] = 144115188059078655
+	fe[2] = 72057589793292288
+	fe[3] = 216172786492637184
+}
+
+// fe_b_p256r1 overwrites `fe` with the curve parameter `b`.
+//
+// NOTE(review): presumably Montgomery-domain limbs derived the same way
+// as those in fe_a_p256r1 -- re-verify together with that constant.
+fe_b_p256r1 :: proc "contextless" (fe: ^Field_Element_p256r1) {
+	// b = 0x5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b
+	fe[0] = 2979529574677938707
+	fe[1] = 16443271603273783221
+	fe[2] = 7662178891097169697
+	fe[3] = 4322198778257916024
+}
+
+// fe_gen_x_p256r1 overwrites `fe` with the x-coordinate of the generator.
+//
+// NOTE(review): presumably Montgomery-domain limbs -- re-verify together
+// with fe_a_p256r1.
+fe_gen_x_p256r1 :: proc "contextless" (fe: ^Field_Element_p256r1) {
+	// G_x = 0x6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296
+	fe[0] = 433742654557116119
+	fe[1] = 15796249253281334771
+	fe[2] = 17272630429700079060
+	fe[3] = 2389116281086019998
+}
+
+// fe_gen_y_p256r1 overwrites `fe` with the y-coordinate of the generator.
+//
+// NOTE(review): presumably Montgomery-domain limbs -- re-verify together
+// with fe_a_p256r1.
+fe_gen_y_p256r1 :: proc "contextless" (fe: ^Field_Element_p256r1) {
+	// G_y = 0x4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5
+	fe[0] = 7870488993788815934
+	fe[1] = 5773808706922869196
+	fe[2] = 7471191399242744833
+	fe[3] = 12943927140747999588
+}
+
+// fe_is_zero_p256r1 returns 1 when `fe_non_zero(fe)` is 0 and 0 otherwise.
+// The result is taken from the borrow of `ctrl - 1`, which is set only
+// when `ctrl` is 0, so no branch on the value is needed.
+@(require_results)
+fe_is_zero_p256r1 :: proc "contextless" (fe: ^Field_Element_p256r1) -> int {
+	ctrl := p256r1.fe_non_zero(fe)
+	_, borrow := bits.sub_u64(ctrl, 1, 0)
+	return int(borrow)
+}
diff --git a/core/crypto/_weierstrass/point.odin b/core/crypto/_weierstrass/point.odin
new file mode 100644
index 00000000000..371725cdb00
--- /dev/null
+++ b/core/crypto/_weierstrass/point.odin
@@ -0,0 +1,549 @@
+package _weierstrass
+
+/*
+This implements prime order short Weierstrass curves defined over a field
+k with char(k) != 2, 3 (`y^2 = x^3 + ax + b`), for the purpose of
+implementing ECDH and ECDSA. Use of this package for other purposes is
+NOT RECOMMENDED.
+
+As an explicit simplicity/performance tradeoff, projective representation
+was chosen so that it is possible to use the complete addition
+formulas.
+
+See:
+- https://eprint.iacr.org/2015/1060.pdf
+- https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html
+
+WARNING: The point addition and doubling formulas are specialized for
+`a = -3`, which covers secp256r1, secp384r1, secp521r1, FRP256v1, SM2,
+and GOST 34.10. The brainpool curves and secp256k1 are NOT SUPPORTED
+and would require slightly different formulas.
+*/
+
+// Point_p256r1 is a P-256 point in projective coordinates (X : Y : Z).
+// The identity is represented with `_z` equal to zero (see pt_identity
+// and pt_is_identity).
+Point_p256r1 :: struct {
+	_x: Field_Element_p256r1,
+	_y: Field_Element_p256r1,
+	_z: Field_Element_p256r1,
+}
+
+// pt_set_xy_bytes sets `p` to the affine point (x_raw, y_raw).
+//
+// Returns false, without writing to `p`, if either buffer is not exactly
+// one field element long, if either coordinate is rejected by
+// fe_set_bytes, or if the pair does not satisfy the curve equation.
+@(require_results)
+pt_set_xy_bytes :: proc "contextless" (p: ^$T, x_raw, y_raw: []byte) -> bool {
+	when T == Point_p256r1 {
+		FE_SZ :: FE_SIZE_P256R1
+		x, y: Field_Element_p256r1
+		defer fe_clear_vec([]^Field_Element_p256r1{&x, &y})
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	if len(x_raw) != FE_SZ || len(y_raw) != FE_SZ {
+		return false
+	}
+
+	if !fe_set_bytes(&x, x_raw) {
+		return false
+	}
+	if !fe_set_bytes(&y, y_raw) {
+		return false
+	}
+	if !is_on_curve(&x, &y) {
+		return false
+	}
+
+	fe_set(&p._x, &x)
+	fe_set(&p._y, &y)
+	fe_one(&p._z)
+
+	return true
+}
+
+// pt_set_x_bytes sets `p` from an x-coordinate and the parity of y
+// (point decompression).
+//
+// y is recovered as a square root of `x^3 + ax + b`; of the two roots
+// the one whose parity matches the low bit of `y_is_odd` is kept.
+// Returns false, without writing to `p`, if `x_raw` has the wrong length,
+// is rejected by fe_set_bytes, or fe_sqrt does not report success.
+@(require_results)
+pt_set_x_bytes :: proc "contextless" (p: ^$T, x_raw: []byte, y_is_odd: int) -> bool {
+	when T == Point_p256r1 {
+		FE_SZ :: FE_SIZE_P256R1
+		x, y, yy, y_neg: Field_Element_p256r1
+		defer fe_clear_vec([]^Field_Element_p256r1{&x, &y, &yy, &y_neg})
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	if len(x_raw) != FE_SZ {
+		return false
+	}
+
+	if !fe_set_bytes(&x, x_raw) {
+		return false
+	}
+	set_yy_candidate(&yy, &x)
+	if fe_sqrt(&y, &yy) != 1 {
+		return false
+	}
+
+	// Pick the correct y-coordinate.
+	//
+	// `parity_neq` is 1 when the computed root has the wrong parity, in
+	// which case the negated root is (presumably) what fe_cond_select
+	// picks.
+	fe_negate(&y_neg, &y)
+	parity_neq := (y_is_odd ~ fe_is_odd(&y)) & 1
+
+	fe_set(&p._x, &x)
+	fe_cond_select(&p._y, &y, &y_neg, parity_neq)
+	fe_one(&p._z)
+
+	return true
+}
+
+// pt_bytes writes the affine coordinates of `p` into `x` and/or `y`.
+//
+// A zero-length slice skips that coordinate; any length other than zero
+// or one field element panics.  Returns false if `p` is the identity.
+//
+// Note: `p` is rescaled in place (Z becomes 1) as a side effect.
+@(require_results)
+pt_bytes :: proc "contextless" (x, y: []byte, p: ^$T) -> bool {
+	when T == Point_p256r1 {
+		FE_SZ :: FE_SIZE_P256R1
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	if pt_is_identity(p) == 1 {
+		return false
+	}
+
+	// Convert to affine coordinates.
+	pt_rescale(p, p)
+
+	switch len(x) {
+	case 0:
+	case FE_SZ:
+		fe_bytes(x, &p._x)
+	case:
+		panic_contextless("weierstrass: invalid x buffer")
+	}
+	switch len(y) {
+	case 0:
+	case FE_SZ:
+		fe_bytes(y, &p._y)
+	case:
+		panic_contextless("weierstrass: invalid y buffer")
+	}
+
+	return true
+}
+
+// pt_set copies `q` into `p`.
+pt_set :: proc "contextless" (p, q: ^$T) {
+	fe_set(&p._x, &q._x)
+	fe_set(&p._y, &q._y)
+	fe_set(&p._z, &q._z)
+}
+
+// pt_identity sets `p` to the identity, encoded as (0 : 1 : 0).
+pt_identity :: proc "contextless" (p: ^$T) {
+	fe_zero(&p._x)
+	fe_one(&p._y)
+	fe_zero(&p._z)
+}
+
+// pt_generator sets `p` to the curve's generator with Z = 1.
+pt_generator :: proc "contextless" (p: ^$T) {
+	fe_gen_x(&p._x)
+	fe_gen_y(&p._y)
+	fe_one(&p._z)
+}
+
+// pt_clear wipes all three coordinates of `p`.
+pt_clear :: proc "contextless" (p: ^$T) {
+	fe_clear(&p._x)
+	fe_clear(&p._y)
+	fe_clear(&p._z)
+}
+
+// pt_clear_vec wipes every point referenced by `arg`.
+pt_clear_vec :: proc "contextless" (arg: []^$T) {
+	for p in arg {
+		pt_clear(p)
+	}
+}
+
+// pt_add sets `p = a + b`.  The result is accumulated in temporaries and
+// only stored at the end, so `p` may alias `a` and/or `b`.
+pt_add :: proc "contextless" (p, a, b: ^$T) {
+	// Algorithm 4 from "Complete addition formulas for prime
+	// order elliptic curves" by Renes, Costello, and Batina.
+	//
+	// The formula is complete in that it is valid for all a and b,
+	// without exceptions or extra assumptions about the inputs.
+	//
+	// The operation costs are `12M + 2mb + 29a`.
+
+	when T == Point_p256r1 {
+		t0, t1, t2, t3, t4, b_fe: Field_Element_p256r1
+		x3, y3, z3: Field_Element_p256r1
+		defer fe_clear_vec([]^Field_Element_p256r1{&t0, &t1, &t2, &t3, &t4, &x3, &y3, &z3})
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	x1, y1, z1 := &a._x, &a._y, &a._z
+	x2, y2, z2 := &b._x, &b._y, &b._z
+
+	fe_b(&b_fe)
+
+	// t0 := X1 * X2 ; t1 := Y1 * Y2 ; t2 := Z1 * Z2 ;
+	fe_mul(&t0, x1, x2)
+	fe_mul(&t1, y1, y2)
+	fe_mul(&t2, z1, z2)
+
+	// t3 := X1 + Y1 ; t4 := X2 + Y2 ; t3 := t3 * t4 ;
+	fe_add(&t3, x1, y1)
+	fe_add(&t4, x2, y2)
+	fe_mul(&t3, &t3, &t4)
+
+	// t4 := t0 + t1 ; t3 := t3 - t4 ; t4 := Y1 + Z1 ;
+	fe_add(&t4, &t0, &t1)
+	fe_sub(&t3, &t3, &t4)
+	fe_add(&t4, y1, z1)
+
+	// X3 := Y2 + Z2 ; t4 := t4 * X3 ; X3 := t1 + t2 ;
+	fe_add(&x3, y2, z2)
+	fe_mul(&t4, &t4, &x3)
+	fe_add(&x3, &t1, &t2)
+
+	// t4 := t4 - X3 ; X3 := X1 + Z1 ; Y3 := X2 + Z2 ;
+	fe_sub(&t4, &t4, &x3)
+	fe_add(&x3, x1, z1)
+	fe_add(&y3, x2, z2)
+
+	// X3 := X3 * Y3 ; Y3 := t0 + t2 ; Y3 := X3 - Y3 ;
+	fe_mul(&x3, &x3, &y3)
+	fe_add(&y3, &t0, &t2)
+	fe_sub(&y3, &x3, &y3)
+
+	// Z3 := b * t2 ; X3 := Y3 - Z3 ; Z3 := X3 + X3 ;
+	fe_mul(&z3, &b_fe, &t2)
+	fe_sub(&x3, &y3, &z3)
+	fe_add(&z3, &x3, &x3)
+
+	// X3 := X3 + Z3 ; Z3 := t1 - X3 ; X3 := t1 + X3 ;
+	fe_add(&x3, &x3, &z3)
+	fe_sub(&z3, &t1, &x3)
+	fe_add(&x3, &t1, &x3)
+
+	// Y3 := b * Y3 ; t1 := t2 + t2 ; t2 := t1 + t2 ;
+	fe_mul(&y3, &b_fe, &y3)
+	fe_add(&t1, &t2, &t2)
+	fe_add(&t2, &t1, &t2)
+
+	// Y3 := Y3 - t2 ; Y3 := Y3 - t0 ; t1 := Y3 + Y3 ;
+	fe_sub(&y3, &y3, &t2)
+	fe_sub(&y3, &y3, &t0)
+	fe_add(&t1, &y3, &y3)
+
+	// Y3 := t1 + Y3 ; t1 := t0 + t0 ; t0 := t1 + t0 ;
+	fe_add(&y3, &t1, &y3)
+	fe_add(&t1, &t0, &t0)
+	fe_add(&t0, &t1, &t0)
+
+	// t0 := t0 - t2 ; t1 := t4 * Y3 ; t2 := t0 * Y3 ;
+	fe_sub(&t0, &t0, &t2)
+	fe_mul(&t1, &t4, &y3)
+	fe_mul(&t2, &t0, &y3)
+
+	// Y3 := X3 * Z3 ; Y3 := Y3 + t2 ; X3 := t3 * X3 ;
+	fe_mul(&y3, &x3, &z3)
+	fe_add(&y3, &y3, &t2)
+	fe_mul(&x3, &t3, &x3)
+
+	// X3 := X3 - t1 ; Z3 := t4 * Z3 ; t1 := t3 * t0 ;
+	fe_sub(&x3, &x3, &t1)
+	fe_mul(&z3, &t4, &z3)
+	fe_mul(&t1, &t3, &t0)
+
+	// Z3 := Z3 + t1 ;
+	fe_add(&z3, &z3, &t1)
+
+	// return X3 , Y3 , Z3 ;
+	fe_set(&p._x, &x3)
+	fe_set(&p._y, &y3)
+	fe_set(&p._z, &z3)
+}
+
+// pt_add_mixed sets `p = a + b` where `b` is assumed to have Z = 1; the
+// `_z` field of `b` is never read.  `p` may alias `a` and/or `b`.
+@(private)
+pt_add_mixed :: proc "contextless" (p, a, b: ^$T) {
+	// Algorithm 5 from "Complete addition formulas for prime
+	// order elliptic curves" by Renes, Costello, and Batina.
+	//
+	// The formula is mixed in that it assumes the z-coordinate
+	// of the addend (`Z2`) is `1`, meaning that it CAN NOT
+	// handle the addend being the point at infinity.
+	//
+	// The operation costs are `11M + 2mb + 23a` saving
+	// `1M + 6a` over `pt_add`.
+
+	when T == Point_p256r1 {
+		t0, t1, t2, t3, t4, b_fe: Field_Element_p256r1
+		x3, y3, z3: Field_Element_p256r1
+		defer fe_clear_vec([]^Field_Element_p256r1{&t0, &t1, &t2, &t3, &t4, &x3, &y3, &z3})
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	x1, y1, z1 := &a._x, &a._y, &a._z
+	x2, y2 := &b._x, &b._y
+
+	fe_b(&b_fe)
+
+	// t0 := X1 * X2 ; t1 := Y1 * Y2 ; t3 := X2 + Y2 ;
+	fe_mul(&t0, x1, x2)
+	fe_mul(&t1, y1, y2)
+	fe_add(&t3, x2, y2)
+
+	// t4 := X1 + Y1 ; t3 := t3 * t4 ; t4 := t0 + t1 ;
+	fe_add(&t4, x1, y1)
+	fe_mul(&t3, &t3, &t4)
+	fe_add(&t4, &t0, &t1)
+
+	// t3 := t3 − t4 ; t4 := Y2 * Z1 ; t4 := t4 + Y1 ;
+	fe_sub(&t3, &t3, &t4)
+	fe_mul(&t4, y2, z1)
+	fe_add(&t4, &t4, y1)
+
+	// Y3 := X2 * Z1 ; Y3 := Y3 + X1 ; Z3 := b * Z1 ;
+	fe_mul(&y3, x2, z1)
+	fe_add(&y3, &y3, x1)
+	fe_mul(&z3, &b_fe, z1)
+
+	// X3 := Y3 − Z3 ; Z3 := X3 + X3 ; X3 := X3 + Z3 ;
+	fe_sub(&x3, &y3, &z3)
+	fe_add(&z3, &x3, &x3)
+	fe_add(&x3, &x3, &z3)
+
+	// Z3 := t1 − X3 ; X3 := t1 + X3 ; Y3 := b * Y3 ;
+	fe_sub(&z3, &t1, &x3)
+	fe_add(&x3, &t1, &x3)
+	fe_mul(&y3, &b_fe, &y3)
+
+	// t1 := Z1 + Z1 ; t2 := t1 + Z1 ; Y3 := Y3 − t2 ;
+	fe_add(&t1, z1, z1)
+	fe_add(&t2, &t1, z1)
+	fe_sub(&y3, &y3, &t2)
+
+	// Y3 := Y3 − t0 ; t1 := Y3 + Y3 ; Y3 := t1 + Y3 ;
+	fe_sub(&y3, &y3, &t0)
+	fe_add(&t1, &y3, &y3)
+	fe_add(&y3, &t1, &y3)
+
+	// t1 := t0 + t0 ; t0 := t1 + t0 ; t0 := t0 − t2 ;
+	fe_add(&t1, &t0, &t0)
+	fe_add(&t0, &t1, &t0)
+	fe_sub(&t0, &t0, &t2)
+
+	// t1 := t4 * Y3 ; t2 := t0 * Y3 ; Y3 := X3 * Z3 ;
+	fe_mul(&t1, &t4, &y3)
+	fe_mul(&t2, &t0, &y3)
+	fe_mul(&y3, &x3, &z3)
+
+	// Y3 := Y3 + t2 ; X3 := t3 * X3 ; X3 := X3 − t1 ;
+	fe_add(&y3, &y3, &t2)
+	fe_mul(&x3, &t3, &x3)
+	fe_sub(&x3, &x3, &t1)
+
+	// Z3 := t4 * Z3 ; t1 := t3 * t0 ; Z3 := Z3 + t1 ;
+	fe_mul(&z3, &t4, &z3)
+	fe_mul(&t1, &t3, &t0)
+	fe_add(&z3, &z3, &t1)
+
+	// return X3 , Y3 , Z3 ;
+	fe_set(&p._x, &x3)
+	fe_set(&p._y, &y3)
+	fe_set(&p._z, &z3)
+}
+
+// pt_double sets `p = 2 * a`.  `p` may alias `a`.
+pt_double :: proc "contextless" (p, a: ^$T) {
+	// Algorithm 6 from "Complete addition formulas for prime
+	// order elliptic curves" by Renes, Costello, and Batina.
+	//
+	// The formula is complete in that it is valid for all a,
+	// without exceptions or extra assumptions about the inputs.
+	//
+	// The operation costs are `8M + 3S + 2mb + 21a`.
+
+	when T == Point_p256r1 {
+		t0, t1, t2, t3, b_fe: Field_Element_p256r1
+		x3, y3, z3: Field_Element_p256r1
+		defer fe_clear_vec([]^Field_Element_p256r1{&t0, &t1, &t2, &t3, &x3, &y3, &z3})
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	x, y, z := &a._x, &a._y, &a._z
+
+	fe_b(&b_fe)
+
+	// t0 := X ^2; t1 := Y ^2; t2 := Z ^2;
+	fe_square(&t0, x)
+	fe_square(&t1, y)
+	fe_square(&t2, z)
+
+	// t3 := X * Y ; t3 := t3 + t3 ; Z3 := X * Z ;
+	fe_mul(&t3, x, y)
+	fe_add(&t3, &t3, &t3)
+	fe_mul(&z3, x, z)
+
+	// Z3 := Z3 + Z3 ; Y3 := b * t2 ; Y3 := Y3 - Z3 ;
+	fe_add(&z3, &z3, &z3)
+	fe_mul(&y3, &b_fe, &t2)
+	fe_sub(&y3, &y3, &z3)
+
+	// X3 := Y3 + Y3 ; Y3 := X3 + Y3 ; X3 := t1 - Y3 ;
+	fe_add(&x3, &y3, &y3)
+	fe_add(&y3, &x3, &y3)
+	fe_sub(&x3, &t1, &y3)
+
+	// Y3 := t1 + Y3 ; Y3 := X3 * Y3 ; X3 := X3 * t3 ;
+	fe_add(&y3, &t1, &y3)
+	fe_mul(&y3, &x3, &y3)
+	fe_mul(&x3, &x3, &t3)
+
+	// t3 := t2 + t2 ; t2 := t2 + t3 ; Z3 := b * Z3 ;
+	fe_add(&t3, &t2, &t2)
+	fe_add(&t2, &t2, &t3)
+	fe_mul(&z3, &b_fe, &z3)
+
+	// Z3 := Z3 - t2 ; Z3 := Z3 - t0 ; t3 := Z3 + Z3 ;
+	fe_sub(&z3, &z3, &t2)
+	fe_sub(&z3, &z3, &t0)
+	fe_add(&t3, &z3, &z3)
+
+	// Z3 := Z3 + t3 ; t3 := t0 + t0 ; t0 := t3 + t0 ;
+	fe_add(&z3, &z3, &t3)
+	fe_add(&t3, &t0, &t0)
+	fe_add(&t0, &t3, &t0)
+
+	// t0 := t0 - t2 ; t0 := t0 * Z3 ; Y3 := Y3 + t0 ;
+	fe_sub(&t0, &t0, &t2)
+	fe_mul(&t0, &t0, &z3)
+	fe_add(&y3, &y3, &t0)
+
+	// t0 := Y * Z ; t0 := t0 + t0 ; Z3 := t0 * Z3 ;
+	fe_mul(&t0, y, z)
+	fe_add(&t0, &t0, &t0)
+	fe_mul(&z3, &t0, &z3)
+
+	// X3 := X3 - Z3 ; Z3 := t0 * t1 ; Z3 := Z3 + Z3 ;
+	fe_sub(&x3, &x3, &z3)
+	fe_mul(&z3, &t0, &t1)
+	fe_add(&z3, &z3, &z3)
+
+	// Z3 := Z3 + Z3 ;
+	fe_add(&z3, &z3, &z3)
+
+	// return X3 , Y3 , Z3 ;
+	fe_set(&p._x, &x3)
+	fe_set(&p._y, &y3)
+	fe_set(&p._z, &z3)
+
+	// NOTE(review): every call above agrees with its per-step comment, so
+	// the fault this guard refers to is presumably elsewhere (e.g. in the
+	// curve constants or the byte conversions) -- TODO confirm against
+	// test vectors before removing it.  The guard is kept as the author
+	// left it; it presumably turns any instantiation of this procedure
+	// into a compile-time error.
+	#panic("FIXME: Incorrect?")
+}
+
+// pt_sub sets `p = a - b` by adding the negation of `b`.
+pt_sub :: proc "contextless" (p, a, b: ^$T) {
+	b_neg: T
+	pt_negate(&b_neg, b)
+	pt_add(p, a, &b_neg)
+
+	// `b_neg` is a point, so it has to be wiped with the point routine;
+	// the field-element `fe_clear` does not accept it.
+	pt_clear(&b_neg)
+}
+
+// pt_negate sets `p = -a`, i.e. (X : -Y : Z).
+pt_negate :: proc "contextless" (p, a: ^$T) {
+	fe_set(&p._x, &a._x)
+	fe_negate(&p._y, &a._y)
+	fe_set(&p._z, &a._z)
+}
+
+// pt_rescale sets `p` to the representation of `a` with Z = 1.  If `a`
+// is the identity, `p` is set to the canonical identity instead.  `p`
+// may alias `a` (pt_bytes relies on this).
+pt_rescale :: proc "contextless" (p, a: ^$T) {
+	// A = 1/Z1
+	// X3 = A*X1
+	// Y3 = A*Y1
+	// Z3 = 1
+	//
+	// As per "From A to Z: Projective coordinates leakage in the wild"
+	// leaking the Z-coordinate is bad. The modular inversion algorithm
+	// used in this library is based on Fermat's Little Theorem.
+	//
+	// See: https://eprint.iacr.org/2020/432.pdf
+
+	// Sample this before `p` is written, as `p` and `a` may alias.
+	was_identity := pt_is_identity(a)
+
+	when T == Point_p256r1 {
+		z_inv: Field_Element_p256r1
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	ident: T
+	fe_inv(&z_inv, &a._z)
+	fe_mul(&p._x, &a._x, &z_inv)
+	fe_mul(&p._y, &a._y, &z_inv)
+	fe_one(&p._z)
+
+	// Replace the result with the identity when the input was the
+	// identity, without branching on `was_identity`.
+	pt_identity(&ident)
+	pt_cond_select(p, p, &ident, was_identity)
+
+	fe_clear(&z_inv)
+}
+
+// pt_cond_select applies fe_cond_select to each coordinate with the same
+// `ctrl`.  Judging by pt_rescale, `ctrl == 1` selects `b` and `ctrl == 0`
+// selects `a`.
+pt_cond_select :: proc "contextless" (p, a, b: ^$T, ctrl: int) {
+	fe_cond_select(&p._x, &a._x, &b._x, ctrl)
+	fe_cond_select(&p._y, &a._y, &b._y, ctrl)
+	fe_cond_select(&p._z, &a._z, &b._z, ctrl)
+}
+
+// pt_equal returns 1 if `a` and `b` represent the same point and 0
+// otherwise, comparing by cross-multiplication so that no inversion is
+// required.
+@(require_results)
+pt_equal :: proc "contextless" (a, b: ^$T) -> int {
+	when T == Point_p256r1 {
+		x1z2, x2z1, y1z2, y2z1: Field_Element_p256r1
+		// Wipe the intermediate products on exit, as every other
+		// routine in this file does for its temporaries.
+		defer fe_clear_vec([]^Field_Element_p256r1{&x1z2, &x2z1, &y1z2, &y2z1})
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	// Check X1Z2 == X2Z1 && Y1Z2 == Y2Z1
+	fe_mul(&x1z2, &a._x, &b._z)
+	fe_mul(&x2z1, &b._x, &a._z)
+
+	fe_mul(&y1z2, &a._y, &b._z)
+	fe_mul(&y2z1, &b._y, &a._z)
+
+	return fe_equal(&x1z2, &x2z1) & fe_equal(&y1z2, &y2z1)
+}
+
+// pt_is_identity returns 1 if the Z-coordinate of `p` is zero, else 0.
+@(require_results)
+pt_is_identity :: proc "contextless" (p: ^$T) -> int {
+	return fe_is_zero(&p._z)
+}
+
+// pt_is_y_odd returns the parity (0 or 1) of the affine y-coordinate of
+// `p`.  The rescale is done on a scratch copy, so `p` is not modified.
+@(require_results)
+pt_is_y_odd :: proc "contextless" (p: ^$T) -> int {
+	tmp: T
+	defer pt_clear(&tmp)
+
+	// Copy the whole point (not a single field element) and rescale the
+	// copy in place; pt_rescale takes both a destination and a source.
+	pt_set(&tmp, p)
+	pt_rescale(&tmp, &tmp)
+
+	return fe_is_odd(&tmp._y)
+}
+
+// is_on_curve reports whether `y^2 == x^3 + ax + b` holds for the given
+// pair of field elements.
+@(private)
+is_on_curve :: proc "contextless" (x, y: ^$T) -> bool {
+	maybe_yy, yy: T
+	defer fe_clear_vec([]^T{&maybe_yy, &yy})
+
+	// RHS: x^3 + ax + b
+	set_yy_candidate(&maybe_yy, x)
+
+	// LHS: y^2
+	fe_square(&yy, y)
+
+	return fe_equal(&maybe_yy, &yy) == 1
+}
+
+// set_yy_candidate sets `maybe_yy = x^3 + ax + b`, the right-hand side of
+// the curve equation.  `maybe_yy` is written before the last read of `x`,
+// so the two must not alias.
+@(private)
+set_yy_candidate :: proc "contextless" (maybe_yy, x: ^$T) {
+	// RHS: x^3 + ax + b
+	tmp: T
+
+	// maybe_yy = x^3
+	fe_square(maybe_yy, x)
+	fe_mul(maybe_yy, maybe_yy, x)
+
+	// maybe_yy += a * x
+	fe_a(&tmp)
+	fe_mul(&tmp, &tmp, x)
+	fe_add(maybe_yy, maybe_yy, &tmp)
+
+	// maybe_yy += b
+	fe_b(&tmp)
+	fe_add(maybe_yy, maybe_yy, &tmp)
+}
diff --git a/core/crypto/_weierstrass/point_s11n_sec.odin b/core/crypto/_weierstrass/point_s11n_sec.odin
new file mode 100644
index 00000000000..42cd9d2273b
--- /dev/null
+++ b/core/crypto/_weierstrass/point_s11n_sec.odin
@@ -0,0 +1,96 @@
+package _weierstrass
+
+import "core:mem"
+
+// Leading byte of each encoding handled by the routines in this file.
+@(private)
+SEC_PREFIX_IDENTITY :: 0x00
+@(private)
+SEC_PREFIX_COMPRESSED_EVEN :: 0x02
+@(private)
+SEC_PREFIX_COMPRESSED_ODD :: 0x03
+@(private)
+SEC_PREFIX_UNCOMPRESSED :: 0x04
+
+// pt_set_sec_bytes decodes `b` into `p`, dispatching on the prefix byte:
+//   0x00         identity; `b` must be exactly 1 byte
+//   0x02 / 0x03  compressed; prefix followed by x
+//   0x04         uncompressed; prefix followed by x and y
+//
+// Returns false for an empty buffer, an unknown prefix, a length that
+// does not match the prefix, or when the coordinate decoding fails.
+@(require_results)
+pt_set_sec_bytes :: proc /* "contextless" */ (p: ^$T, b: []byte) -> bool {
+	when T == Point_p256r1 {
+		FE_SZ :: FE_SIZE_P256R1
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	b_len := len(b)
+	if b_len < 1 {
+		return false
+	}
+
+	switch b[0] {
+	case SEC_PREFIX_IDENTITY:
+		if b_len != 1 {
+			return false
+		}
+		pt_identity(p)
+		return true
+	case SEC_PREFIX_COMPRESSED_EVEN, SEC_PREFIX_COMPRESSED_ODD:
+		if b_len != 1 + FE_SZ {
+			return false
+		}
+		// 0x02 -> 0 (even y), 0x03 -> 1 (odd y).
+		y_is_odd := b[0] - SEC_PREFIX_COMPRESSED_EVEN
+		return pt_set_x_bytes(p, b[1:], int(y_is_odd))
+	case SEC_PREFIX_UNCOMPRESSED:
+		if b_len != 1 + 2 * FE_SZ {
+			return false
+		}
+		x, y := b[1:1+FE_SZ], b[1+FE_SZ:]
+		return pt_set_xy_bytes(p, x, y)
+	case:
+		return false
+	}
+}
+
+// pt_sec_bytes encodes `p` into `b`, which must be exactly as long as the
+// encoding: 1 byte for the identity (whatever `compressed` is), 1 + FE_SZ
+// bytes when `compressed`, and 1 + 2 * FE_SZ bytes otherwise.  Returns
+// false on a length mismatch.
+//
+// Note: this goes through pt_bytes, which rescales `p` in place.
+@(require_results)
+pt_sec_bytes :: proc "contextless" (b: []byte, p: ^$T, compressed: bool) -> bool {
+	when T == Point_p256r1 {
+		FE_SZ :: FE_SIZE_P256R1
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	b_len := len(b)
+	if pt_is_identity(p) == 1 {
+		if b_len != 1 {
+			return false
+		}
+		b[0] = SEC_PREFIX_IDENTITY
+		return true
+	}
+
+	// `x` and `y` say where pt_bytes should write each coordinate.  In
+	// the compressed form y is only needed for its parity, so it goes to
+	// the scratch buffer `y_` rather than into `b`.
+	x, y: []byte
+	y_: [FE_SZ]byte
+	switch compressed {
+	case true:
+		if b_len != 1 + FE_SZ {
+			return false
+		}
+		x, y = b[1:], y_[:]
+	case false:
+		if b_len != 1 + 2 * FE_SZ {
+			return false
+		}
+		b[0]= SEC_PREFIX_UNCOMPRESSED
+		x, y = b[1:1+FE_SZ], b[1+FE_SZ:]
+	}
+	if !pt_bytes(x, y, p) {
+		return false
+	}
+	if compressed {
+		// Instead of calling pt_is_y_odd, just serializing
+		// y into a temp buffer and checking the parity saves
+		// 1 redundant rescale call.
+		//
+		// The parity is read from the last serialized byte, which
+		// presumably is the least-significant one (big-endian output).
+		y_is_odd := byte(y[FE_SZ-1] & 1)
+		b[0] = SEC_PREFIX_COMPRESSED_EVEN + y_is_odd
+		mem.zero_explicit(&y_, size_of(y_))
+	}
+
+	return true
+}
diff --git a/core/crypto/_weierstrass/sc.odin b/core/crypto/_weierstrass/sc.odin
new file mode 100644
index 00000000000..c22e0cb77da
--- /dev/null
+++ b/core/crypto/_weierstrass/sc.odin
@@ -0,0 +1,66 @@
+package _weierstrass
+
+import p256r1 "core:crypto/_fiat/field_scalarp256r1"
+
+// Scalar_p256r1 is the scalar type accepted by pt_scalar_mul; it aliases
+// the Montgomery-domain element of the fiat scalar-field package.
+Scalar_p256r1 :: p256r1.Montgomery_Domain_Field_Element
+
+// SC_SIZE_P256R1 is the length in bytes of a serialized scalar.
+SC_SIZE_P256R1 :: 32
+
+// The single-member procedure groups below give curve-generic code one
+// `sc_*` name per scalar operation.  Each currently forwards to the
+// P-256 scalar-field implementation only.
+
+sc_clear :: proc {
+	p256r1.fe_clear,
+}
+
+sc_clear_vec :: proc {
+	p256r1.fe_clear_vec,
+}
+
+sc_set_bytes :: proc {
+	p256r1.fe_from_bytes,
+}
+sc_bytes :: proc {
+	p256r1.fe_to_bytes,
+}
+
+sc_set :: proc {
+	p256r1.fe_set,
+}
+
+sc_zero :: proc {
+	p256r1.fe_zero,
+}
+
+// NOTE(review): this is the only group in this file whose name carries a
+// curve suffix; presumably `sc_one` was intended -- confirm before
+// relying on the name.
+sc_one_p256r1 :: proc {
+	p256r1.fe_one,
+}
+
+sc_add :: proc {
+	p256r1.fe_add,
+}
+
+sc_sub :: proc {
+	p256r1.fe_sub,
+}
+
+// Note: negation maps to the fiat routine named `fe_opp`.
+sc_negate :: proc {
+	p256r1.fe_opp,
+}
+
+sc_mul :: proc {
+	p256r1.fe_mul,
+}
+
+sc_square :: proc {
+	p256r1.fe_square,
+}
+
+sc_cond_assign :: proc {
+	p256r1.fe_cond_assign,
+}
+
+sc_equal :: proc {
+	p256r1.fe_equal,
+}
+
+sc_is_odd :: proc {
+	p256r1.fe_is_odd,
+}
diff --git a/core/crypto/_weierstrass/scalar_mul.odin b/core/crypto/_weierstrass/scalar_mul.odin
new file mode 100644
index 00000000000..15b3b77ce51
--- /dev/null
+++ b/core/crypto/_weierstrass/scalar_mul.odin
@@ -0,0 +1,37 @@
+package _weierstrass
+
+import "core:mem"
+import "core:slice"
+
+// TODO/perf: Rewrite this to use a windowed multiply.
+//
+// pt_scalar_mul sets `p = [s]a`.
+//
+// Every bit of the serialized scalar is processed with the same two
+// point additions; the bit only decides, through pt_cond_select, whether
+// the identity or the current multiple of `a` is the addend.  `a` is
+// copied before `p` is first written, so `p` may alias `a`.
+pt_scalar_mul :: proc /* "contextless" */ (p, a: ^$T, s: ^$S) {
+	when T == Point_p256r1 && S == Scalar_p256r1 {
+		FE_SZ :: FE_SIZE_P256R1
+		SC_SZ :: SC_SIZE_P256R1
+		q, id, addend: Point_p256r1
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+	N_BITS :: SC_SZ * 8
+
+	// Registered first so that it runs last: the scalar bytes are wiped
+	// before the working points, as in a straight-line epilogue.
+	defer pt_clear_vec([]^T{&q, &addend})
+
+	// Naive constant-time double-and-add.
+	//
+	// `q` holds a, 2a, 4a, ... while `p` accumulates the result.
+	pt_set(&q, a)
+	pt_identity(p)
+	pt_identity(&id)
+
+	// Reverse the serialized scalar so that bit `n` of the scalar is bit
+	// `n % 8` of `k[n / 8]` (this presumes sc_bytes emits big-endian).
+	k: [SC_SZ]byte
+	defer mem.zero_explicit(&k, size_of(k))
+	sc_bytes(k[:], s)
+	slice.reverse(k[:])
+
+	for n in 0 ..< N_BITS {
+		byte_idx := n / 8
+		shift := uint(n % 8)
+		bit := int((k[byte_idx] >> shift) & 1)
+
+		// addend = bit ? q : id
+		pt_cond_select(&addend, &id, &q, bit)
+		pt_add(p, p, &addend)
+
+		// q = 2q, via the complete addition formula.
+		pt_add(&q, &q, &q)
+		// pt_double(&q, &q)
+	}
+}