ARM support.

data61 · Apr 19, 2021 · 0f656fa · 0f656fa
1 parent 6c89808
commit 0f656fa
Show file tree

Hide file tree

Showing 112 changed files with 1,735 additions and 474 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "Programs/Circuits"]
 	path = Programs/Circuits
 	url = https://github.com/mkskeller/bristol-fashion
+[submodule "simde"]
+	path = simde
+	url = https://github.com/simd-everywhere/simde
diff --git a/BMR/Key.h b/BMR/Key.h
@@ -7,11 +7,10 @@
 #define COMMON_INC_KEY_H_
 
 #include <iostream>
-#include <emmintrin.h>
-#include <smmintrin.h>
 #include <string.h>
 
 #include "Tools/FlexBuffer.h"
+#include "Tools/intrinsics.h"
 #include "Math/gf2nlong.h"
 
 using namespace std;

diff --git a/BMR/Party.cpp b/BMR/Party.cpp
@@ -371,7 +371,7 @@ void FakeProgramParty::receive_spdz_wires(ReceivedMsg& msg)
 		spdz_mac_key.unpack(spdz_wires[op].back());
 		if (!MC)
 		{
-			MC = new Passing_MAC_Check<Share<gf2n_long>>(spdz_mac_key);
+			MC = new MAC_Check_<Share<gf2n_long>>(spdz_mac_key);
 			cout << "MAC key: " << hex << spdz_mac_key << endl;
 			mac_key = spdz_mac_key;
 		}

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,14 @@
 The changelog explains changes pulled through from the private development repository. Bug fixes and small enhancements are committed between releases and not documented here.
 
+## 0.2.4 (Apr 19, 2021)
+
+- ARM support
+- Base OTs optionally without SimpleOT/AVX
+- Use OpenSSL instead of Crypto++ for elliptic curves
+- Post-sacrifice binary computation with replicated secret sharing similar
+  to [Araki et al.](https://www.ieee-security.org/TC/SP2017/papers/96.pdf)
+- More flexible multithreading
+
 ## 0.2.3 (Feb 23, 2021)
 
 - Distributed key generation for homomorphic encryption with active security similar to [Rotaru et al.](https://eprint.iacr.org/2019/1300)

diff --git a/CONFIG b/CONFIG
@@ -3,7 +3,6 @@ ROOT = .
 OPTIM= -O3
 #PROF = -pg
 #DEBUG = -DDEBUG
-#MEMPROTECT = -DMEMPROTECT
 GDEBUG = -g
 
 # set this to your preferred local storage directory
@@ -12,8 +11,8 @@ PREP_DIR = '-DPREP_DIR="Player-Data/"'
 # set for SHE preprocessing (SPDZ and Overdrive)
 USE_NTL = 0
 
-# set for using GF(2^128) online phase, OT, MASCOT, or BMR
-# unset for GF(2^40) online and offline phase
+# set for using GF(2^128)
+# unset for GF(2^40)
 USE_GF2N_LONG = 1
 
 # set to -march=<architecture> for optimization
@@ -28,6 +27,24 @@ USE_GF2N_LONG = 1
 ARCH = -mtune=native -msse4.1 -msse4.2 -maes -mpclmul -mavx -mavx2 -mbmi2 -madx
 ARCH = -march=native
 
+MACHINE := $(shell uname -m)
+OS := $(shell uname -s)
+ifeq ($(MACHINE), x86_64)
+# set this to 0 to avoid using AVX for OT
+ifeq ($(OS), Linux)
+CHECK_AVX := $(shell grep -q avx /proc/cpuinfo; echo $$?)
+ifeq ($(CHECK_AVX), 0)
+AVX_OT = 1
+else
+AVX_OT = 0
+endif
+else
+AVX_OT = 1
+endif
+else
+AVX_OT = 0
+endif
+
 # allow to set compiler in CONFIG.mine
 CXX = g++
 
@@ -38,6 +55,10 @@ ifeq ($(USE_GF2N_LONG),1)
 GF2N_LONG = -DUSE_GF2N_LONG
 endif
 
+ifeq ($(AVX_OT), 0)
+CFLAGS += -DNO_AVX_OT
+endif
+
 # MAX_MOD_SZ (for FHE) must be at least and GFP_MOD_SZ (for computation)
 # must be exactly ceil(len(p)/len(word)) for the relevant prime p
 # GFP_MOD_SZ only needs to be set for primes of bit length more than 256.
@@ -51,7 +72,6 @@ ifeq ($(USE_NTL),1)
 LDLIBS := -lntl $(LDLIBS)
 endif
 
-OS := $(shell uname -s)
 ifeq ($(OS), Linux)
 LDLIBS += -lrt
 endif
@@ -62,12 +82,10 @@ else
 BOOST = -lboost_thread $(MY_BOOST)
 endif
 
-CFLAGS += $(ARCH) $(MY_CFLAGS) $(GDEBUG) -Wextra -Wall $(OPTIM) -I$(ROOT) -pthread $(PROF) $(DEBUG) $(MOD) $(MEMPROTECT) $(GF2N_LONG) $(PREP_DIR) $(SECURE) -std=c++11 -Werror
+CFLAGS += $(ARCH) $(MY_CFLAGS) $(GDEBUG) -Wextra -Wall $(OPTIM) -I$(ROOT) -pthread $(PROF) $(DEBUG) $(MOD) $(GF2N_LONG) $(PREP_DIR) $(SECURE) -std=c++11 -Werror
 CPPFLAGS = $(CFLAGS)
 LD = $(CXX)
 
-ECLIB = -lcryptopp
-
 ifeq ($(OS), Darwin)
 ifeq ($(USE_NTL),1)
 CFLAGS += -Wno-error=unused-parameter

diff --git a/Compiler/GC/types.py b/Compiler/GC/types.py
@@ -284,7 +284,7 @@ class sbits(bits):
     Instances can also be initialized from :py:obj:`~Compiler.types.regint`
     and :py:obj:`~Compiler.types.sint`.
     """
-    max_length = 128
+    max_length = 64
     reg_type = 'sb'
     is_clear = False
     clear_type = cbits

diff --git a/Compiler/comparison.py b/Compiler/comparison.py
@@ -190,6 +190,8 @@ def TruncLeakyInRing(a, k, m, signed):
     Returns a >> m.
     Requires a < 2^k and leaks a % 2^m (needs to be constant or random).
     """
+    if k == m:
+        return 0
     assert k > m
     assert int(program.options.ring) >= k
     from .types import sint, intbitint, cint, cgf2n

diff --git a/Compiler/dijkstra.py b/Compiler/dijkstra.py
@@ -103,7 +103,7 @@ def bubble_up(self, start):
         childpos = MemValue(start * shift)
         @for_range(self.levels - 1)
         def f(i):
-            parentpos = childpos.right_shift(1, self.levels)
+            parentpos = childpos.right_shift(1, self.levels + 1)
             parent, parent_state = self.heap.read_and_maybe_remove(parentpos)
             child, child_state = self.heap.read_and_maybe_remove(childpos)
             swap = parent > child

diff --git a/Compiler/floatingpoint.py b/Compiler/floatingpoint.py
@@ -1,3 +1,4 @@
+import math
 from math import log, floor, ceil
 from Compiler.instructions import *
 from . import types
@@ -411,6 +412,8 @@ def TruncInRing(to_shift, l, pow2m):
     return types.sint.bit_compose(reversed(bits))
 
 def SplitInRing(a, l, m):
+    if l == 1:
+        return m.if_else(a, 0), m.if_else(0, a), 1
     pow2m = Pow2(m, l, None)
     upper = TruncInRing(a, l, pow2m)
     lower = a - upper * pow2m
@@ -620,27 +623,36 @@ def BITLT(a, b, bit_length):
 def BitDecFull(a):
     from .library import get_program, do_while, if_, break_point
     from .types import sint, regint, longint
-    p=int(get_program().options.prime)
+    p = get_program().prime
     assert p
     bit_length = p.bit_length()
-    bbits = [sint(size=a.size) for i in range(bit_length)]
-    tbits = [[sint(size=1) for i in range(bit_length)] for j in range(a.size)]
-    pbits = util.bit_decompose(p)
-    # Loop until we get some random integers less than p
-    done = [regint(0) for i in range(a.size)]
-    @do_while
-    def get_bits_loop():
+    logp = int(round(math.log(p, 2)))
+    if abs(p - 2 ** logp) / p < 2 ** -get_program().security:
+        # inspired by Rabbit (https://eprint.iacr.org/2021/119)
+        # no need for exact randomness generation
+        # if modulo a power of two is close enough
+        bbits = [sint.get_random_bit(size=a.size) for i in range(logp)]
+        if logp != bit_length:
+            bbits += [sint(0, size=a.size)]
+    else:
+        bbits = [sint(size=a.size) for i in range(bit_length)]
+        tbits = [[sint(size=1) for i in range(bit_length)] for j in range(a.size)]
+        pbits = util.bit_decompose(p)
+        # Loop until we get some random integers less than p
+        done = [regint(0) for i in range(a.size)]
+        @do_while
+        def get_bits_loop():
+            for j in range(a.size):
+                @if_(done[j] == 0)
+                def _():
+                    for i in range(bit_length):
+                        tbits[j][i].link(sint.get_random_bit())
+                    c = regint(BITLT(tbits[j], pbits, bit_length).reveal())
+                    done[j].link(c)
+            return (sum(done) != a.size)
         for j in range(a.size):
-            @if_(done[j] == 0)
-            def _():
-                for i in range(bit_length):
-                    tbits[j][i].link(sint.get_random_bit())
-                c = regint(BITLT(tbits[j], pbits, bit_length).reveal())
-                done[j].link(c)
-        return (sum(done) != a.size)
-    for j in range(a.size):
-        for i in range(bit_length):
-            movs(bbits[i][j], tbits[j][i])
+            for i in range(bit_length):
+                movs(bbits[i][j], tbits[j][i])
     b = sint.bit_compose(bbits)
     c = (a-b).reveal()
     t = (p-c).bit_decompose(bit_length)

diff --git a/Compiler/instructions.py b/Compiler/instructions.py
@@ -1577,19 +1577,6 @@ class writesocketc(base.IOInstruction):
     def has_var_args(self):
         return True
 
-@base.vectorize
-class writesockets(base.IOInstruction):
-    """
-    Write a variable number of secret shares + MACs from registers into a socket
-    for a specified client id, message_type
-    """
-    __slots__ = []
-    code = base.opcodes['WRITESOCKETS']
-    arg_format = tools.chain(['ci', 'int'], itertools.repeat('s'))
-
-    def has_var_args(self):
-        return True
-
 @base.vectorize
 class writesocketshare(base.IOInstruction):
     """ Write a variable number of shares (without MACs) from secret

diff --git a/Compiler/instructions_base.py b/Compiler/instructions_base.py
@@ -903,7 +903,7 @@ class DirectMemoryWriteInstruction(DirectMemoryInstruction, \
                                        WriteMemoryInstruction):
     __slots__ = []
     def __init__(self, *args, **kwargs):
-        if program.curr_tape.prevent_direct_memory_write:
+        if not program.curr_tape.singular:
             raise CompilerError('Direct memory writing prevented in threads')
         super(DirectMemoryWriteInstruction, self).__init__(*args, **kwargs)
 

diff --git a/Compiler/library.py b/Compiler/library.py
@@ -1062,14 +1062,14 @@ def f(i, j):
     """
     return for_range_multithread(n_threads, None, n_loops)
 
-def multithread(n_threads, n_items, max_size=None):
+def multithread(n_threads, n_items=None, max_size=None):
     """
     Distribute the computation of :py:obj:`n_items` to
     :py:obj:`n_threads` threads, but leave the in-thread repetition up
     to the user.
 
     :param n_threads: compile-time (int)
-    :param n_items: regint/cint/int
+    :param n_items: regint/cint/int (default: :py:obj:`n_threads`)
 
     The following executes ``f(0, 8)``, ``f(8, 8)``, and
     ``f(16, 9)`` in three different threads:
@@ -1080,6 +1080,8 @@ def multithread(n_threads, n_items, max_size=None):
         def f(base, size):
             ...
     """
+    if n_items is None:
+        n_items = n_threads
     if max_size is None:
         return map_reduce(n_threads, None, n_items, initializer=lambda: [],
                           reducer=None, looping=False)

diff --git a/Compiler/ml.py b/Compiler/ml.py
@@ -703,6 +703,9 @@ def _(i):
         progress('f input')
 
     def forward(self, batch=None):
+        if batch is None:
+            batch = regint.Array(self.N)
+            batch.assign(regint.inc(self.N))
         self.compute_f_input(batch=batch)
         if self.activation_layer:
             self.activation_layer.forward(batch)

diff --git a/Compiler/oram.py b/Compiler/oram.py
@@ -91,7 +91,11 @@ def get_slice(self):
             for length,start in zip(self.lengths[:-1],series(self.lengths)):
                 res.append(remainder.mod2m(length, total_length - start, False))
                 remainder -= res[-1]
-                remainder /= floatingpoint.two_power(length)
+                if Program.prog.options.ring:
+                    remainder = remainder.trunc_zeros(length,
+                                                      total_length - start, False)
+                else:
+                    remainder /= floatingpoint.two_power(length)
             res.append(remainder)
             return res
     def set_slice(self, value):
@@ -1498,12 +1502,12 @@ def translate_index(self, index):
             rem = mod2m(index, self.log_entries_per_block, log2(self.size), False)
             c = mod2m(rem, self.log_entries_per_element, \
                           self.log_entries_per_block, False)
-            b = (rem - c).trunc_zeros(self.log_entries_per_element,
+            b = trunc_zeros(rem - c, self.log_entries_per_element,
                                       self.log_entries_per_block)
             if self.small:
                 return 0, b, c
             else:
-                return (index - rem).trunc_zeros(self.log_entries_per_block,
+                return trunc_zeros(index - rem, self.log_entries_per_block,
                                                  log2(self.size)), b, c
         else:
             index_bits = bit_decompose(index, log2(self.size))